// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include #include #include #include #include #include "Message.h" #include "Pipe.h" #include "SimpleMessenger.h" #include "common/debug.h" #include "common/errno.h" #define dout_subsys ceph_subsys_ms #undef dout_prefix #define dout_prefix _pipe_prefix(_dout) ostream& Pipe::_pipe_prefix(std::ostream *_dout) { return *_dout << "-- " << msgr->get_myinst().addr << " >> " << peer_addr << " pipe(" << this << " sd=" << sd << " :" << port << " pgs=" << peer_global_seq << " cs=" << connect_seq << " l=" << policy.lossy << ")."; } /************************************** * Pipe */ Pipe::Pipe(SimpleMessenger *r, int st, Connection *con) : reader_thread(this), writer_thread(this), msgr(r), sd(-1), port(0), peer_type(-1), pipe_lock("SimpleMessenger::Pipe::pipe_lock"), state(st), connection_state(NULL), reader_running(false), reader_joining(false), writer_running(false), in_q(r->dispatch_queue.create_queue(this)), keepalive(false), close_on_empty(false), connect_seq(0), peer_global_seq(0), out_seq(0), in_seq(0), in_seq_acked(0) { if (con) { connection_state = con->get(); connection_state->reset_pipe(this); } else { connection_state = new Connection(); connection_state->pipe = get(); } msgr->timeout = msgr->cct->_conf->ms_tcp_read_timeout * 1000; //convert to ms if (msgr->timeout == 0) msgr->timeout = -1; } Pipe::~Pipe() { in_q->put(); assert(out_q.empty()); assert(sent.empty()); if (connection_state) connection_state->put(); } void Pipe::handle_ack(uint64_t seq) { lsubdout(msgr->cct, ms, 15) << "reader got ack seq " << seq << dendl; // trim sent list while (!sent.empty() && sent.front()->get_seq() <= seq) { Message *m = sent.front(); sent.pop_front(); lsubdout(msgr->cct, ms, 10) << "reader got ack seq " << seq << " >= " << m->get_seq() << " on " << m << " " << *m << dendl; m->put(); } if (sent.empty() && close_on_empty) { lsubdout(msgr->cct, ms, 10) << "reader got last ack, queue empty, closing" << dendl; stop(); } } void Pipe::start_reader() { assert(pipe_lock.is_locked()); assert(!reader_running); reader_running = true; reader_thread.create(msgr->cct->_conf->ms_rwthread_stack_bytes); } void Pipe::start_writer() { assert(pipe_lock.is_locked()); assert(!writer_running); writer_running = true; writer_thread.create(msgr->cct->_conf->ms_rwthread_stack_bytes); } void Pipe::join_reader() { if (!reader_running) return; assert(!reader_joining); reader_joining = true; cond.Signal(); pipe_lock.Unlock(); reader_thread.join(); pipe_lock.Lock(); assert(reader_joining); reader_joining = false; } void Pipe::queue_received(Message *m, int priority) { assert(pipe_lock.is_locked()); in_q->queue(m, priority); } int Pipe::accept() { ldout(msgr->cct,10) << "accept" << dendl; // my creater gave me sd via accept() assert(state == STATE_ACCEPTING); // announce myself. int rc = tcp_write(CEPH_BANNER, strlen(CEPH_BANNER)); if (rc < 0) { ldout(msgr->cct,10) << "accept couldn't write banner" << dendl; state = STATE_CLOSED; return -1; } // and my addr bufferlist addrs; ::encode(msgr->my_inst.addr, addrs); port = msgr->my_inst.addr.get_port(); // and peer's socket addr (they might not know their ip) entity_addr_t socket_addr; socklen_t len = sizeof(socket_addr.ss_addr()); int r = ::getpeername(sd, (sockaddr*)&socket_addr.ss_addr(), &len); if (r < 0) { char buf[80]; ldout(msgr->cct,0) << "accept failed to getpeername " << errno << " " << strerror_r(errno, buf, sizeof(buf)) << dendl; state = STATE_CLOSED; return -1; } ::encode(socket_addr, addrs); rc = tcp_write(addrs.c_str(), addrs.length()); if (rc < 0) { ldout(msgr->cct,10) << "accept couldn't write my+peer addr" << dendl; state = STATE_CLOSED; return -1; } ldout(msgr->cct,1) << "accept sd=" << sd << dendl; // identify peer char banner[strlen(CEPH_BANNER)+1]; if (tcp_read(banner, strlen(CEPH_BANNER)) < 0) { ldout(msgr->cct,10) << "accept couldn't read banner" << dendl; state = STATE_CLOSED; return -1; } if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) { banner[strlen(CEPH_BANNER)] = 0; ldout(msgr->cct,1) << "accept peer sent bad banner '" << banner << "' (should be '" << CEPH_BANNER << "')" << dendl; state = STATE_CLOSED; return -1; } bufferlist addrbl; { bufferptr tp(sizeof(peer_addr)); addrbl.push_back(tp); } if (tcp_read(addrbl.c_str(), addrbl.length()) < 0) { ldout(msgr->cct,10) << "accept couldn't read peer_addr" << dendl; state = STATE_CLOSED; return -1; } { bufferlist::iterator ti = addrbl.begin(); ::decode(peer_addr, ti); } ldout(msgr->cct,10) << "accept peer addr is " << peer_addr << dendl; if (peer_addr.is_blank_ip()) { // peer apparently doesn't know what ip they have; figure it out for them. int port = peer_addr.get_port(); peer_addr.addr = socket_addr.addr; peer_addr.set_port(port); ldout(msgr->cct,0) << "accept peer addr is really " << peer_addr << " (socket is " << socket_addr << ")" << dendl; } set_peer_addr(peer_addr); // so that connection_state gets set up ceph_msg_connect connect; ceph_msg_connect_reply reply; Pipe *existing = 0; bufferptr bp; bufferlist authorizer, authorizer_reply; bool authorizer_valid; uint64_t feat_missing; bool replaced = false; // this should roughly mirror pseudocode at // http://ceph.newdream.net/wiki/Messaging_protocol int reply_tag = 0; uint64_t existing_seq = -1; while (1) { if (tcp_read((char*)&connect, sizeof(connect)) < 0) { ldout(msgr->cct,10) << "accept couldn't read connect" << dendl; goto fail_unlocked; } authorizer.clear(); if (connect.authorizer_len) { bp = buffer::create(connect.authorizer_len); if (tcp_read(bp.c_str(), connect.authorizer_len) < 0) { ldout(msgr->cct,10) << "accept couldn't read connect authorizer" << dendl; goto fail_unlocked; } authorizer.push_back(bp); authorizer_reply.clear(); } ldout(msgr->cct,20) << "accept got peer connect_seq " << connect.connect_seq << " global_seq " << connect.global_seq << dendl; msgr->lock.Lock(); // FIXME if (msgr->dispatch_queue.stop) goto shutting_down; // note peer's type, flags set_peer_type(connect.host_type); policy = msgr->get_policy(connect.host_type); ldout(msgr->cct,10) << "accept of host_type " << connect.host_type << ", policy.lossy=" << policy.lossy << dendl; memset(&reply, 0, sizeof(reply)); reply.protocol_version = msgr->get_proto_version(peer_type, false); // mismatch? ldout(msgr->cct,10) << "accept my proto " << reply.protocol_version << ", their proto " << connect.protocol_version << dendl; if (connect.protocol_version != reply.protocol_version) { reply.tag = CEPH_MSGR_TAG_BADPROTOVER; msgr->lock.Unlock(); goto reply; } feat_missing = policy.features_required & ~(uint64_t)connect.features; if (feat_missing) { ldout(msgr->cct,1) << "peer missing required features " << std::hex << feat_missing << std::dec << dendl; reply.tag = CEPH_MSGR_TAG_FEATURES; msgr->lock.Unlock(); goto reply; } msgr->lock.Unlock(); if (msgr->verify_authorizer(connection_state, peer_type, connect.authorizer_protocol, authorizer, authorizer_reply, authorizer_valid) && !authorizer_valid) { ldout(msgr->cct,0) << "accept bad authorizer" << dendl; reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER; goto reply; } msgr->lock.Lock(); if (msgr->dispatch_queue.stop) goto shutting_down; // existing? if (msgr->rank_pipe.count(peer_addr)) { existing = msgr->rank_pipe[peer_addr]; existing->pipe_lock.Lock(); if (connect.global_seq < existing->peer_global_seq) { ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq << " > " << connect.global_seq << ", RETRY_GLOBAL" << dendl; reply.tag = CEPH_MSGR_TAG_RETRY_GLOBAL; reply.global_seq = existing->peer_global_seq; // so we can send it below.. existing->pipe_lock.Unlock(); msgr->lock.Unlock(); goto reply; } else { ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq << " <= " << connect.global_seq << ", looks ok" << dendl; } if (existing->policy.lossy) { ldout(msgr->cct,0) << "accept replacing existing (lossy) channel (new one lossy=" << policy.lossy << ")" << dendl; existing->was_session_reset(); goto replace; } ldout(msgr->cct,0) << "accept connect_seq " << connect.connect_seq << " vs existing " << existing->connect_seq << " state " << existing->get_state_name() << dendl; if (connect.connect_seq == 0 && existing->connect_seq > 0) { ldout(msgr->cct,0) << "accept peer reset, then tried to connect to us, replacing" << dendl; if (policy.resetcheck) existing->was_session_reset(); // this resets out_queue, msg_ and connect_seq #'s goto replace; } if (connect.connect_seq < existing->connect_seq) { // old attempt, or we sent READY but they didn't get it. ldout(msgr->cct,10) << "accept existing " << existing << ".cseq " << existing->connect_seq << " > " << connect.connect_seq << ", RETRY_SESSION" << dendl; goto retry_session; } if (connect.connect_seq == existing->connect_seq) { // if the existing connection successfully opened, and/or // subsequently went to standby, then the peer should bump // their connect_seq and retry: this is not a connection race // we need to resolve here. if (existing->state == STATE_OPEN || existing->state == STATE_STANDBY) { ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq << " == " << connect.connect_seq << ", OPEN|STANDBY, RETRY_SESSION" << dendl; goto retry_session; } // connection race? if (peer_addr < msgr->my_inst.addr || existing->policy.server) { // incoming wins ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq << " == " << connect.connect_seq << ", or we are server, replacing my attempt" << dendl; if (!(existing->state == STATE_CONNECTING || existing->state == STATE_WAIT)) lderr(msgr->cct) << "accept race bad state, would replace, existing=" << existing->get_state_name() << " " << existing << ".cseq=" << existing->connect_seq << " == " << connect.connect_seq << dendl; assert(existing->state == STATE_CONNECTING || existing->state == STATE_WAIT); goto replace; } else { // our existing outgoing wins ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq << " == " << connect.connect_seq << ", sending WAIT" << dendl; assert(peer_addr > msgr->my_inst.addr); if (!(existing->state == STATE_CONNECTING)) lderr(msgr->cct) << "accept race bad state, would send wait, existing=" << existing->get_state_name() << " " << existing << ".cseq=" << existing->connect_seq << " == " << connect.connect_seq << dendl; assert(existing->state == STATE_CONNECTING); // make sure our outgoing connection will follow through existing->_send_keepalive(); reply.tag = CEPH_MSGR_TAG_WAIT; existing->pipe_lock.Unlock(); msgr->lock.Unlock(); goto reply; } } assert(connect.connect_seq > existing->connect_seq); assert(connect.global_seq >= existing->peer_global_seq); if (policy.resetcheck && // RESETSESSION only used by servers; peers do not reset each other existing->connect_seq == 0) { ldout(msgr->cct,0) << "accept we reset (peer sent cseq " << connect.connect_seq << ", " << existing << ".cseq = " << existing->connect_seq << "), sending RESETSESSION" << dendl; reply.tag = CEPH_MSGR_TAG_RESETSESSION; msgr->lock.Unlock(); existing->pipe_lock.Unlock(); goto reply; } // reconnect ldout(msgr->cct,10) << "accept peer sent cseq " << connect.connect_seq << " > " << existing->connect_seq << dendl; goto replace; } // existing else if (policy.resetcheck && connect.connect_seq > 0) { // we reset, and they are opening a new session ldout(msgr->cct,0) << "accept we reset (peer sent cseq " << connect.connect_seq << "), sending RESETSESSION" << dendl; msgr->lock.Unlock(); reply.tag = CEPH_MSGR_TAG_RESETSESSION; goto reply; } else { // new session ldout(msgr->cct,10) << "accept new session" << dendl; existing = NULL; goto open; } assert(0); retry_session: reply.tag = CEPH_MSGR_TAG_RETRY_SESSION; reply.connect_seq = existing->connect_seq + 1; existing->pipe_lock.Unlock(); msgr->lock.Unlock(); goto reply; reply: reply.features = ((uint64_t)connect.features & policy.features_supported) | policy.features_required; reply.authorizer_len = authorizer_reply.length(); rc = tcp_write((char*)&reply, sizeof(reply)); if (rc < 0) goto fail_unlocked; if (reply.authorizer_len) { rc = tcp_write(authorizer_reply.c_str(), authorizer_reply.length()); if (rc < 0) goto fail_unlocked; } } replace: if (connect.features & CEPH_FEATURE_RECONNECT_SEQ) { reply_tag = CEPH_MSGR_TAG_SEQ; existing_seq = existing->in_seq; } ldout(msgr->cct,10) << "accept replacing " << existing << dendl; existing->stop(); existing->unregister_pipe(); replaced = true; if (!existing->policy.lossy) { // drop my Connection, and take a ref to the existing one. do not // clear existing->connection_state, since read_message and // write_message both dereference it without pipe_lock. connection_state->put(); connection_state = existing->connection_state->get(); // make existing Connection reference us existing->connection_state->reset_pipe(this); // steal incoming queue in_seq = existing->in_seq; in_seq_acked = in_seq; in_q->put(); in_q = existing->in_q; in_q->lock.Lock(); in_q->parent = this; in_q->restart_queue(); in_q->lock.Unlock(); existing->in_q = msgr->dispatch_queue.create_queue(existing); // steal outgoing queue and out_seq existing->requeue_sent(); out_seq = existing->out_seq; ldout(msgr->cct,10) << "accept re-queuing on out_seq " << out_seq << " in_seq " << in_seq << dendl; for (map >::iterator p = existing->out_q.begin(); p != existing->out_q.end(); p++) out_q[p->first].splice(out_q[p->first].begin(), p->second); } existing->pipe_lock.Unlock(); open: // open connect_seq = connect.connect_seq + 1; peer_global_seq = connect.global_seq; state = STATE_OPEN; ldout(msgr->cct,10) << "accept success, connect_seq = " << connect_seq << ", sending READY" << dendl; // send READY reply reply.tag = (reply_tag ? reply_tag : CEPH_MSGR_TAG_READY); reply.features = policy.features_supported; reply.global_seq = msgr->get_global_seq(); reply.connect_seq = connect_seq; reply.flags = 0; reply.authorizer_len = authorizer_reply.length(); if (policy.lossy) reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY; connection_state->set_features((int)reply.features & (int)connect.features); ldout(msgr->cct,10) << "accept features " << connection_state->get_features() << dendl; // notify msgr->dispatch_queue.queue_accept(connection_state); // ok! if (msgr->dispatch_queue.stop) goto shutting_down; register_pipe(); msgr->lock.Unlock(); rc = tcp_write((char*)&reply, sizeof(reply)); if (rc < 0) { goto fail_unlocked; } if (reply.authorizer_len) { rc = tcp_write(authorizer_reply.c_str(), authorizer_reply.length()); if (rc < 0) { goto fail_unlocked; } } if (reply_tag == CEPH_MSGR_TAG_SEQ) { uint64_t newly_acked_seq = 0; if(tcp_write((char*)&existing_seq, sizeof(existing_seq)) < 0) { ldout(msgr->cct,2) << "accept write error on in_seq" << dendl; goto fail_unlocked; } if (tcp_read((char*)&newly_acked_seq, sizeof(newly_acked_seq)) < 0) { ldout(msgr->cct,2) << "accept read error on newly_acked_seq" << dendl; goto fail_unlocked; } requeue_sent(newly_acked_seq); } pipe_lock.Lock(); if (state != STATE_CLOSED) { ldout(msgr->cct,10) << "accept starting writer, state " << get_state_name() << dendl; start_writer(); } ldout(msgr->cct,20) << "accept done" << dendl; pipe_lock.Unlock(); return 0; // success. fail_unlocked: pipe_lock.Lock(); if (state != STATE_CLOSED) { bool queued = is_queued(); if (queued) state = policy.server ? STATE_STANDBY : STATE_CONNECTING; else if (replaced) state = STATE_STANDBY; else state = STATE_CLOSED; fault(); if (queued || replaced) start_writer(); } pipe_lock.Unlock(); return -1; shutting_down: msgr->lock.Unlock(); pipe_lock.Lock(); state = STATE_CLOSED; fault(); pipe_lock.Unlock(); return -1; } int Pipe::connect() { bool got_bad_auth = false; ldout(msgr->cct,10) << "connect " << connect_seq << dendl; assert(pipe_lock.is_locked()); __u32 cseq = connect_seq; __u32 gseq = msgr->get_global_seq(); // stop reader thrad join_reader(); pipe_lock.Unlock(); char tag = -1; int rc; struct msghdr msg; struct iovec msgvec[2]; int msglen; char banner[strlen(CEPH_BANNER) + 1]; // extra byte makes coverity happy entity_addr_t paddr; entity_addr_t peer_addr_for_me, socket_addr; AuthAuthorizer *authorizer = NULL; bufferlist addrbl, myaddrbl; const md_config_t *conf = msgr->cct->_conf; // close old socket. this is safe because we stopped the reader thread above. if (sd >= 0) ::close(sd); char buf[80]; // create socket? sd = ::socket(peer_addr.get_family(), SOCK_STREAM, 0); if (sd < 0) { lderr(msgr->cct) << "connect couldn't created socket " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } // connect! ldout(msgr->cct,10) << "connecting to " << peer_addr << dendl; rc = ::connect(sd, (sockaddr*)&peer_addr.addr, peer_addr.addr_size()); if (rc < 0) { ldout(msgr->cct,2) << "connect error " << peer_addr << ", " << errno << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } // disable Nagle algorithm? if (conf->ms_tcp_nodelay) { int flag = 1; int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); if (r < 0) ldout(msgr->cct,0) << "connect couldn't set TCP_NODELAY: " << strerror_r(errno, buf, sizeof(buf)) << dendl; } // verify banner // FIXME: this should be non-blocking, or in some other way verify the banner as we get it. if (tcp_read((char*)&banner, strlen(CEPH_BANNER)) < 0) { ldout(msgr->cct,2) << "connect couldn't read banner, " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) { ldout(msgr->cct,0) << "connect protocol error (bad banner) on peer " << paddr << dendl; goto fail; } memset(&msg, 0, sizeof(msg)); msgvec[0].iov_base = banner; msgvec[0].iov_len = strlen(CEPH_BANNER); msg.msg_iov = msgvec; msg.msg_iovlen = 1; msglen = msgvec[0].iov_len; if (do_sendmsg(&msg, msglen)) { ldout(msgr->cct,2) << "connect couldn't write my banner, " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } // identify peer { bufferptr p(sizeof(paddr) * 2); addrbl.push_back(p); } if (tcp_read(addrbl.c_str(), addrbl.length()) < 0) { ldout(msgr->cct,2) << "connect couldn't read peer addrs, " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } { bufferlist::iterator p = addrbl.begin(); ::decode(paddr, p); ::decode(peer_addr_for_me, p); port = peer_addr_for_me.get_port(); } ldout(msgr->cct,20) << "connect read peer addr " << paddr << " on socket " << sd << dendl; if (peer_addr != paddr) { if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() && peer_addr.get_nonce() == paddr.get_nonce()) { ldout(msgr->cct,0) << "connect claims to be " << paddr << " not " << peer_addr << " - presumably this is the same node!" << dendl; } else { ldout(msgr->cct,0) << "connect claims to be " << paddr << " not " << peer_addr << " - wrong node!" << dendl; goto fail; } } ldout(msgr->cct,20) << "connect peer addr for me is " << peer_addr_for_me << dendl; msgr->learned_addr(peer_addr_for_me); ::encode(msgr->my_inst.addr, myaddrbl); memset(&msg, 0, sizeof(msg)); msgvec[0].iov_base = myaddrbl.c_str(); msgvec[0].iov_len = myaddrbl.length(); msg.msg_iov = msgvec; msg.msg_iovlen = 1; msglen = msgvec[0].iov_len; if (do_sendmsg(&msg, msglen)) { ldout(msgr->cct,2) << "connect couldn't write my addr, " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } ldout(msgr->cct,10) << "connect sent my addr " << msgr->my_inst.addr << dendl; while (1) { delete authorizer; authorizer = msgr->get_authorizer(peer_type, false); bufferlist authorizer_reply; ceph_msg_connect connect; connect.features = policy.features_supported; connect.host_type = msgr->my_type; connect.global_seq = gseq; connect.connect_seq = cseq; connect.protocol_version = msgr->get_proto_version(peer_type, true); connect.authorizer_protocol = authorizer ? authorizer->protocol : 0; connect.authorizer_len = authorizer ? authorizer->bl.length() : 0; if (authorizer) ldout(msgr->cct,10) << "connect.authorizer_len=" << connect.authorizer_len << " protocol=" << connect.authorizer_protocol << dendl; connect.flags = 0; if (policy.lossy) connect.flags |= CEPH_MSG_CONNECT_LOSSY; // this is fyi, actually, server decides! memset(&msg, 0, sizeof(msg)); msgvec[0].iov_base = (char*)&connect; msgvec[0].iov_len = sizeof(connect); msg.msg_iov = msgvec; msg.msg_iovlen = 1; msglen = msgvec[0].iov_len; if (authorizer) { msgvec[1].iov_base = authorizer->bl.c_str(); msgvec[1].iov_len = authorizer->bl.length(); msg.msg_iovlen++; msglen += msgvec[1].iov_len; } ldout(msgr->cct,10) << "connect sending gseq=" << gseq << " cseq=" << cseq << " proto=" << connect.protocol_version << dendl; if (do_sendmsg(&msg, msglen)) { ldout(msgr->cct,2) << "connect couldn't write gseq, cseq, " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } ldout(msgr->cct,20) << "connect wrote (self +) cseq, waiting for reply" << dendl; ceph_msg_connect_reply reply; if (tcp_read((char*)&reply, sizeof(reply)) < 0) { ldout(msgr->cct,2) << "connect read reply " << strerror_r(errno, buf, sizeof(buf)) << dendl; goto fail; } ldout(msgr->cct,20) << "connect got reply tag " << (int)reply.tag << " connect_seq " << reply.connect_seq << " global_seq " << reply.global_seq << " proto " << reply.protocol_version << " flags " << (int)reply.flags << dendl; authorizer_reply.clear(); if (reply.authorizer_len) { ldout(msgr->cct,10) << "reply.authorizer_len=" << reply.authorizer_len << dendl; bufferptr bp = buffer::create(reply.authorizer_len); if (tcp_read(bp.c_str(), reply.authorizer_len) < 0) { ldout(msgr->cct,10) << "connect couldn't read connect authorizer_reply" << dendl; goto fail; } authorizer_reply.push_back(bp); } if (authorizer) { bufferlist::iterator iter = authorizer_reply.begin(); if (!authorizer->verify_reply(iter)) { ldout(msgr->cct,0) << "failed verifying authorize reply" << dendl; goto fail; } } pipe_lock.Lock(); if (state != STATE_CONNECTING) { ldout(msgr->cct,0) << "connect got RESETSESSION but no longer connecting" << dendl; goto stop_locked; } if (reply.tag == CEPH_MSGR_TAG_FEATURES) { ldout(msgr->cct,0) << "connect protocol feature mismatch, my " << std::hex << connect.features << " < peer " << reply.features << " missing " << (reply.features & ~policy.features_supported) << std::dec << dendl; goto fail_locked; } if (reply.tag == CEPH_MSGR_TAG_BADPROTOVER) { ldout(msgr->cct,0) << "connect protocol version mismatch, my " << connect.protocol_version << " != " << reply.protocol_version << dendl; goto fail_locked; } if (reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { ldout(msgr->cct,0) << "connect got BADAUTHORIZER" << dendl; if (got_bad_auth) goto stop_locked; got_bad_auth = true; pipe_lock.Unlock(); authorizer = msgr->get_authorizer(peer_type, true); // try harder continue; } if (reply.tag == CEPH_MSGR_TAG_RESETSESSION) { ldout(msgr->cct,0) << "connect got RESETSESSION" << dendl; was_session_reset(); in_q->restart_queue(); cseq = 0; pipe_lock.Unlock(); continue; } if (reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) { gseq = msgr->get_global_seq(reply.global_seq); ldout(msgr->cct,10) << "connect got RETRY_GLOBAL " << reply.global_seq << " chose new " << gseq << dendl; pipe_lock.Unlock(); continue; } if (reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) { assert(reply.connect_seq > connect_seq); ldout(msgr->cct,10) << "connect got RETRY_SESSION " << connect_seq << " -> " << reply.connect_seq << dendl; cseq = connect_seq = reply.connect_seq; pipe_lock.Unlock(); continue; } if (reply.tag == CEPH_MSGR_TAG_WAIT) { ldout(msgr->cct,3) << "connect got WAIT (connection race)" << dendl; state = STATE_WAIT; goto stop_locked; } if (reply.tag == CEPH_MSGR_TAG_READY || reply.tag == CEPH_MSGR_TAG_SEQ) { uint64_t feat_missing = policy.features_required & ~(uint64_t)reply.features; if (feat_missing) { ldout(msgr->cct,1) << "missing required features " << std::hex << feat_missing << std::dec << dendl; goto fail_locked; } if (reply.tag == CEPH_MSGR_TAG_SEQ) { ldout(msgr->cct,10) << "got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" << dendl; uint64_t newly_acked_seq = 0; if (tcp_read((char*)&newly_acked_seq, sizeof(newly_acked_seq)) < 0) { ldout(msgr->cct,2) << "connect read error on newly_acked_seq" << dendl; goto fail_locked; } handle_ack(newly_acked_seq); if (tcp_write((char*)&in_seq, sizeof(in_seq)) < 0) { ldout(msgr->cct,2) << "connect write error on in_seq" << dendl; goto fail_locked; } } // hooray! peer_global_seq = reply.global_seq; policy.lossy = reply.flags & CEPH_MSG_CONNECT_LOSSY; state = STATE_OPEN; connect_seq = cseq + 1; assert(connect_seq == reply.connect_seq); backoff = utime_t(); connection_state->set_features((unsigned)reply.features & (unsigned)connect.features); ldout(msgr->cct,10) << "connect success " << connect_seq << ", lossy = " << policy.lossy << ", features " << connection_state->get_features() << dendl; msgr->dispatch_queue.queue_connect(connection_state); if (!reader_running) { ldout(msgr->cct,20) << "connect starting reader" << dendl; start_reader(); } delete authorizer; return 0; } // protocol error ldout(msgr->cct,0) << "connect got bad tag " << (int)tag << dendl; goto fail_locked; } fail: pipe_lock.Lock(); fail_locked: if (state == STATE_CONNECTING) fault(); else ldout(msgr->cct,3) << "connect fault, but state = " << get_state_name() << " != connecting, stopping" << dendl; stop_locked: delete authorizer; return -1; } void Pipe::register_pipe() { ldout(msgr->cct,10) << "register_pipe" << dendl; assert(msgr->lock.is_locked()); assert(msgr->rank_pipe.count(peer_addr) == 0); msgr->rank_pipe[peer_addr] = this; } void Pipe::unregister_pipe() { assert(msgr->lock.is_locked()); if (msgr->rank_pipe.count(peer_addr) && msgr->rank_pipe[peer_addr] == this) { ldout(msgr->cct,10) << "unregister_pipe" << dendl; msgr->rank_pipe.erase(peer_addr); } else { ldout(msgr->cct,10) << "unregister_pipe - not registered" << dendl; } } void Pipe::requeue_sent(uint64_t max_acked) { if (sent.empty()) return; list& rq = out_q[CEPH_MSG_PRIO_HIGHEST]; while (!sent.empty()) { Message *m = sent.back(); if (m->get_seq() > max_acked) { sent.pop_back(); ldout(msgr->cct,10) << "requeue_sent " << *m << " for resend seq " << out_seq << " (" << m->get_seq() << ")" << dendl; rq.push_front(m); out_seq--; } else sent.clear(); } } /* * Tears down the Pipe's message queues, and removes them from the DispatchQueue * Must hold pipe_lock prior to calling. */ void Pipe::discard_out_queue() { ldout(msgr->cct,10) << "discard_queue" << dendl; for (list::iterator p = sent.begin(); p != sent.end(); p++) { if (*p < (void *) DispatchQueue::D_NUM_CODES) { continue; // skip non-Message dispatch codes } ldout(msgr->cct,20) << " discard " << *p << dendl; (*p)->put(); } sent.clear(); for (map >::iterator p = out_q.begin(); p != out_q.end(); p++) for (list::iterator r = p->second.begin(); r != p->second.end(); r++) { if (*r < (void *) DispatchQueue::D_NUM_CODES) { continue; // skip non-Message dispatch codes } ldout(msgr->cct,20) << " discard " << *r << dendl; (*r)->put(); } out_q.clear(); } void Pipe::fault(bool onread) { const md_config_t *conf = msgr->cct->_conf; assert(pipe_lock.is_locked()); cond.Signal(); if (onread && state == STATE_CONNECTING) { ldout(msgr->cct,10) << "fault already connecting, reader shutting down" << dendl; return; } char buf[80]; ldout(msgr->cct,2) << "fault " << errno << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl; if (state == STATE_CLOSED || state == STATE_CLOSING) { ldout(msgr->cct,10) << "fault already closed|closing" << dendl; return; } shutdown_socket(); // lossy channel? if (policy.lossy && state != STATE_CONNECTING) { ldout(msgr->cct,10) << "fault on lossy channel, failing" << dendl; stop(); // ugh pipe_lock.Unlock(); msgr->lock.Lock(); pipe_lock.Lock(); unregister_pipe(); msgr->lock.Unlock(); in_q->discard_queue(); discard_out_queue(); // disconnect from Connection, and mark it failed. future messages // will be dropped. assert(connection_state); connection_state->clear_pipe(this); msgr->dispatch_queue.queue_reset(connection_state); return; } // requeue sent items requeue_sent(); if (policy.standby && !is_queued()) { ldout(msgr->cct,0) << "fault with nothing to send, going to standby" << dendl; state = STATE_STANDBY; return; } if (state != STATE_CONNECTING) { if (policy.server) { ldout(msgr->cct,0) << "fault, server, going to standby" << dendl; state = STATE_STANDBY; } else { ldout(msgr->cct,0) << "fault, initiating reconnect" << dendl; connect_seq++; state = STATE_CONNECTING; } backoff = utime_t(); } else if (backoff == utime_t()) { ldout(msgr->cct,0) << "fault" << dendl; backoff.set_from_double(conf->ms_initial_backoff); } else { ldout(msgr->cct,10) << "fault waiting " << backoff << dendl; cond.WaitInterval(msgr->cct, pipe_lock, backoff); backoff += backoff; if (backoff > conf->ms_max_backoff) backoff.set_from_double(conf->ms_max_backoff); ldout(msgr->cct,10) << "fault done waiting or woke up" << dendl; } } void Pipe::was_session_reset() { assert(pipe_lock.is_locked()); ldout(msgr->cct,10) << "was_session_reset" << dendl; in_q->discard_queue(); discard_out_queue(); msgr->dispatch_queue.queue_remote_reset(connection_state); out_seq = 0; in_seq = 0; connect_seq = 0; } void Pipe::stop() { ldout(msgr->cct,10) << "stop" << dendl; assert(pipe_lock.is_locked()); state = STATE_CLOSED; cond.Signal(); shutdown_socket(); } /* read msgs from socket. * also, server. */ void Pipe::reader() { if (state == STATE_ACCEPTING) accept(); pipe_lock.Lock(); // loop. while (state != STATE_CLOSED && state != STATE_CONNECTING) { assert(pipe_lock.is_locked()); // sleep if (re)connecting if (state == STATE_STANDBY) { ldout(msgr->cct,20) << "reader sleeping during reconnect|standby" << dendl; cond.Wait(pipe_lock); continue; } pipe_lock.Unlock(); char buf[80]; char tag = -1; ldout(msgr->cct,20) << "reader reading tag..." << dendl; if (tcp_read((char*)&tag, 1) < 0) { pipe_lock.Lock(); ldout(msgr->cct,2) << "reader couldn't read tag, " << strerror_r(errno, buf, sizeof(buf)) << dendl; fault(true); continue; } if (tag == CEPH_MSGR_TAG_KEEPALIVE) { ldout(msgr->cct,20) << "reader got KEEPALIVE" << dendl; pipe_lock.Lock(); continue; } // open ... if (tag == CEPH_MSGR_TAG_ACK) { ldout(msgr->cct,20) << "reader got ACK" << dendl; ceph_le64 seq; int rc = tcp_read((char*)&seq, sizeof(seq)); pipe_lock.Lock(); if (rc < 0) { ldout(msgr->cct,2) << "reader couldn't read ack seq, " << strerror_r(errno, buf, sizeof(buf)) << dendl; fault(true); } else if (state != STATE_CLOSED) { handle_ack(seq); } continue; } else if (tag == CEPH_MSGR_TAG_MSG) { ldout(msgr->cct,20) << "reader got MSG" << dendl; Message *m = 0; int r = read_message(&m); pipe_lock.Lock(); if (!m) { if (r < 0) fault(true); continue; } if (state == STATE_CLOSED || state == STATE_CONNECTING) { msgr->dispatch_throttle_release(m->get_dispatch_throttle_size()); m->put(); continue; } // check received seq#. if it is old, drop the message. // note that incoming messages may skip ahead. this is convenient for the client // side queueing because messages can't be renumbered, but the (kernel) client will // occasionally pull a message out of the sent queue to send elsewhere. in that case // it doesn't matter if we "got" it or not. if (m->get_seq() <= in_seq) { ldout(msgr->cct,0) << "reader got old message " << m->get_seq() << " <= " << in_seq << " " << m << " " << *m << ", discarding" << dendl; msgr->dispatch_throttle_release(m->get_dispatch_throttle_size()); m->put(); continue; } m->set_connection(connection_state->get()); // note last received message. in_seq = m->get_seq(); cond.Signal(); // wake up writer, to ack this ldout(msgr->cct,10) << "reader got message " << m->get_seq() << " " << m << " " << *m << dendl; queue_received(m); } else if (tag == CEPH_MSGR_TAG_CLOSE) { ldout(msgr->cct,20) << "reader got CLOSE" << dendl; pipe_lock.Lock(); if (state == STATE_CLOSING) state = STATE_CLOSED; else state = STATE_CLOSING; cond.Signal(); break; } else { ldout(msgr->cct,0) << "reader bad tag " << (int)tag << dendl; pipe_lock.Lock(); fault(true); } } // reap? reader_running = false; unlock_maybe_reap(); ldout(msgr->cct,10) << "reader done" << dendl; } /* write msgs to socket. * also, client. */ void Pipe::writer() { char buf[80]; pipe_lock.Lock(); while (state != STATE_CLOSED) {// && state != STATE_WAIT) { ldout(msgr->cct,10) << "writer: state = " << get_state_name() << " policy.server=" << policy.server << dendl; // standby? if (is_queued() && state == STATE_STANDBY && !policy.server) { connect_seq++; state = STATE_CONNECTING; } // connect? if (state == STATE_CONNECTING) { assert(!policy.server); connect(); continue; } if (state == STATE_CLOSING) { // write close tag ldout(msgr->cct,20) << "writer writing CLOSE tag" << dendl; char tag = CEPH_MSGR_TAG_CLOSE; state = STATE_CLOSED; pipe_lock.Unlock(); if (sd) { int r = ::write(sd, &tag, 1); // we can ignore r, actually; we don't care if this succeeds. r++; r = 0; // placate gcc } pipe_lock.Lock(); continue; } if (state != STATE_CONNECTING && state != STATE_WAIT && state != STATE_STANDBY && (is_queued() || in_seq > in_seq_acked)) { // keepalive? if (keepalive) { pipe_lock.Unlock(); int rc = write_keepalive(); pipe_lock.Lock(); if (rc < 0) { ldout(msgr->cct,2) << "writer couldn't write keepalive, " << strerror_r(errno, buf, sizeof(buf)) << dendl; fault(); continue; } keepalive = false; } // send ack? if (in_seq > in_seq_acked) { uint64_t send_seq = in_seq; pipe_lock.Unlock(); int rc = write_ack(send_seq); pipe_lock.Lock(); if (rc < 0) { ldout(msgr->cct,2) << "writer couldn't write ack, " << strerror_r(errno, buf, sizeof(buf)) << dendl; fault(); continue; } in_seq_acked = send_seq; } // grab outgoing message Message *m = _get_next_outgoing(); if (m) { m->set_seq(++out_seq); if (!policy.lossy || close_on_empty) { // put on sent list sent.push_back(m); m->get(); } pipe_lock.Unlock(); ldout(msgr->cct,20) << "writer encoding " << m->get_seq() << " " << m << " " << *m << dendl; // associate message with Connection (for benefit of encode_payload) m->set_connection(connection_state->get()); // encode and copy out of *m m->encode(connection_state->get_features(), !msgr->cct->_conf->ms_nocrc); ldout(msgr->cct,20) << "writer sending " << m->get_seq() << " " << m << dendl; int rc = write_message(m); pipe_lock.Lock(); if (rc < 0) { ldout(msgr->cct,1) << "writer error sending " << m << ", " << errno << ": " << strerror_r(errno, buf, sizeof(buf)) << dendl; fault(); } m->put(); } continue; } if (sent.empty() && close_on_empty) { ldout(msgr->cct,10) << "writer out and sent queues empty, closing" << dendl; stop(); continue; } // wait ldout(msgr->cct,20) << "writer sleeping" << dendl; cond.Wait(pipe_lock); } ldout(msgr->cct,20) << "writer finishing" << dendl; // reap? writer_running = false; unlock_maybe_reap(); ldout(msgr->cct,10) << "writer done" << dendl; } void Pipe::unlock_maybe_reap() { if (!reader_running && !writer_running) { shutdown_socket(); pipe_lock.Unlock(); msgr->queue_reap(this); } else { pipe_lock.Unlock(); } } static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off) { // create a buffer to read into that matches the data alignment unsigned left = len; unsigned head = 0; if (off & ~CEPH_PAGE_MASK) { // head head = MIN(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left); bufferptr bp = buffer::create(head); data.push_back(bp); left -= head; } unsigned middle = left & CEPH_PAGE_MASK; if (middle > 0) { bufferptr bp = buffer::create_page_aligned(middle); data.push_back(bp); left -= middle; } if (left) { bufferptr bp = buffer::create(left); data.push_back(bp); } } int Pipe::read_message(Message **pm) { int ret = -1; // envelope //ldout(msgr->cct,10) << "receiver.read_message from sd " << sd << dendl; ceph_msg_header header; ceph_msg_footer footer; __u32 header_crc; if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) { if (tcp_read((char*)&header, sizeof(header)) < 0) return -1; header_crc = ceph_crc32c_le(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc)); } else { ceph_msg_header_old oldheader; if (tcp_read((char*)&oldheader, sizeof(oldheader)) < 0) return -1; // this is fugly memcpy(&header, &oldheader, sizeof(header)); header.src = oldheader.src.name; header.reserved = oldheader.reserved; header.crc = oldheader.crc; header_crc = ceph_crc32c_le(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc)); } ldout(msgr->cct,20) << "reader got envelope type=" << header.type << " src " << entity_name_t(header.src) << " front=" << header.front_len << " data=" << header.data_len << " off " << header.data_off << dendl; // verify header crc if (header_crc != header.crc) { ldout(msgr->cct,0) << "reader got bad header crc " << header_crc << " != " << header.crc << dendl; return -1; } bufferlist front, middle, data; int front_len, middle_len; unsigned data_len, data_off; int aborted; Message *message; utime_t recv_stamp = ceph_clock_now(msgr->cct); bool waited_on_throttle = false; uint64_t message_size = header.front_len + header.middle_len + header.data_len; if (message_size) { if (policy.throttler) { ldout(msgr->cct,10) << "reader wants " << message_size << " from policy throttler " << policy.throttler->get_current() << "/" << policy.throttler->get_max() << dendl; waited_on_throttle = policy.throttler->get(message_size); } // throttle total bytes waiting for dispatch. do this _after_ the // policy throttle, as this one does not deadlock (unless dispatch // blocks indefinitely, which it shouldn't). in contrast, the // policy throttle carries for the lifetime of the message. ldout(msgr->cct,10) << "reader wants " << message_size << " from dispatch throttler " << msgr->dispatch_throttler.get_current() << "/" << msgr->dispatch_throttler.get_max() << dendl; waited_on_throttle |= msgr->dispatch_throttler.get(message_size); } utime_t throttle_stamp = ceph_clock_now(msgr->cct); // read front front_len = header.front_len; if (front_len) { bufferptr bp = buffer::create(front_len); if (tcp_read(bp.c_str(), front_len) < 0) goto out_dethrottle; front.push_back(bp); ldout(msgr->cct,20) << "reader got front " << front.length() << dendl; } // read middle middle_len = header.middle_len; if (middle_len) { bufferptr bp = buffer::create(middle_len); if (tcp_read(bp.c_str(), middle_len) < 0) goto out_dethrottle; middle.push_back(bp); ldout(msgr->cct,20) << "reader got middle " << middle.length() << dendl; } // read data data_len = le32_to_cpu(header.data_len); data_off = le32_to_cpu(header.data_off); if (data_len) { unsigned offset = 0; unsigned left = data_len; bufferlist newbuf, rxbuf; bufferlist::iterator blp; int rxbuf_version = 0; while (left > 0) { // wait for data if (tcp_read_wait() < 0) goto out_dethrottle; // get a buffer connection_state->lock.Lock(); map >::iterator p = connection_state->rx_buffers.find(header.tid); if (p != connection_state->rx_buffers.end()) { if (rxbuf.length() == 0 || p->second.second != rxbuf_version) { ldout(msgr->cct,10) << "reader seleting rx buffer v " << p->second.second << " at offset " << offset << " len " << p->second.first.length() << dendl; rxbuf = p->second.first; rxbuf_version = p->second.second; // make sure it's big enough if (rxbuf.length() < data_len) rxbuf.push_back(buffer::create(data_len - rxbuf.length())); blp = p->second.first.begin(); blp.advance(offset); } } else { if (!newbuf.length()) { ldout(msgr->cct,20) << "reader allocating new rx buffer at offset " << offset << dendl; alloc_aligned_buffer(newbuf, data_len, data_off); blp = newbuf.begin(); blp.advance(offset); } } bufferptr bp = blp.get_current_ptr(); int read = MIN(bp.length(), left); ldout(msgr->cct,20) << "reader reading nonblocking into " << (void*)bp.c_str() << " len " << bp.length() << dendl; int got = tcp_read_nonblocking(bp.c_str(), read); ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl; connection_state->lock.Unlock(); if (got < 0) goto out_dethrottle; if (got > 0) { blp.advance(got); data.append(bp, 0, got); offset += got; left -= got; } // else we got a signal or something; just loop. } } // footer if (tcp_read((char*)&footer, sizeof(footer)) < 0) goto out_dethrottle; aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0; ldout(msgr->cct,10) << "aborted = " << aborted << dendl; if (aborted) { ldout(msgr->cct,0) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length() << " byte message.. ABORTED" << dendl; ret = 0; goto out_dethrottle; } ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length() << " byte message" << dendl; message = decode_message(msgr->cct, header, footer, front, middle, data); if (!message) { ret = -EINVAL; goto out_dethrottle; } message->set_throttler(policy.throttler); // store reservation size in message, so we don't get confused // by messages entering the dispatch queue through other paths. message->set_dispatch_throttle_size(message_size); message->set_recv_stamp(recv_stamp); message->set_throttle_stamp(throttle_stamp); message->set_recv_complete_stamp(ceph_clock_now(msgr->cct)); *pm = message; return 0; out_dethrottle: // release bytes reserved from the throttlers on failure if (message_size) { if (policy.throttler) { ldout(msgr->cct,10) << "reader releasing " << message_size << " to policy throttler " << policy.throttler->get_current() << "/" << policy.throttler->get_max() << dendl; policy.throttler->put(message_size); } msgr->dispatch_throttle_release(message_size); } return ret; } int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more) { char buf[80]; while (len > 0) { if (0) { // sanity int l = 0; for (unsigned i=0; imsg_iovlen; i++) l += msg->msg_iov[i].iov_len; assert(l == len); } int r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0)); if (r == 0) ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl; if (r < 0) { ldout(msgr->cct,1) << "do_sendmsg error " << strerror_r(errno, buf, sizeof(buf)) << dendl; return -1; } if (state == STATE_CLOSED) { ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl; errno = EINTR; return -1; // close enough } len -= r; if (len == 0) break; // hrmph. trim r bytes off the front of our message. ldout(msgr->cct,20) << "do_sendmsg short write did " << r << ", still have " << len << dendl; while (r > 0) { if (msg->msg_iov[0].iov_len <= (size_t)r) { // lose this whole item //ldout(msgr->cct,30) << "skipping " << msg->msg_iov[0].iov_len << ", " << (msg->msg_iovlen-1) << " v, " << r << " left" << dendl; r -= msg->msg_iov[0].iov_len; msg->msg_iov++; msg->msg_iovlen--; } else { // partial! //ldout(msgr->cct,30) << "adjusting " << msg->msg_iov[0].iov_len << ", " << msg->msg_iovlen << " v, " << r << " left" << dendl; msg->msg_iov[0].iov_base = (char *)msg->msg_iov[0].iov_base + r; msg->msg_iov[0].iov_len -= r; break; } } } return 0; } int Pipe::write_ack(uint64_t seq) { ldout(msgr->cct,10) << "write_ack " << seq << dendl; char c = CEPH_MSGR_TAG_ACK; ceph_le64 s; s = seq; struct msghdr msg; memset(&msg, 0, sizeof(msg)); struct iovec msgvec[2]; msgvec[0].iov_base = &c; msgvec[0].iov_len = 1; msgvec[1].iov_base = &s; msgvec[1].iov_len = sizeof(s); msg.msg_iov = msgvec; msg.msg_iovlen = 2; if (do_sendmsg(&msg, 1 + sizeof(s), true) < 0) return -1; return 0; } int Pipe::write_keepalive() { ldout(msgr->cct,10) << "write_keepalive" << dendl; char c = CEPH_MSGR_TAG_KEEPALIVE; struct msghdr msg; memset(&msg, 0, sizeof(msg)); struct iovec msgvec[2]; msgvec[0].iov_base = &c; msgvec[0].iov_len = 1; msg.msg_iov = msgvec; msg.msg_iovlen = 1; if (do_sendmsg(&msg, 1) < 0) return -1; return 0; } int Pipe::write_message(Message *m) { ceph_msg_header& header = m->get_header(); ceph_msg_footer& footer = m->get_footer(); int ret; // get envelope, buffers header.front_len = m->get_payload().length(); header.middle_len = m->get_middle().length(); header.data_len = m->get_data().length(); footer.flags = CEPH_MSG_FOOTER_COMPLETE; m->calc_header_crc(); bufferlist blist = m->get_payload(); blist.append(m->get_middle()); blist.append(m->get_data()); ldout(msgr->cct,20) << "write_message " << m << dendl; // set up msghdr and iovecs struct msghdr msg; memset(&msg, 0, sizeof(msg)); struct iovec *msgvec = new iovec[3 + blist.buffers().size()]; // conservative upper bound msg.msg_iov = msgvec; int msglen = 0; // send tag char tag = CEPH_MSGR_TAG_MSG; msgvec[msg.msg_iovlen].iov_base = &tag; msgvec[msg.msg_iovlen].iov_len = 1; msglen++; msg.msg_iovlen++; // send envelope ceph_msg_header_old oldheader; if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) { msgvec[msg.msg_iovlen].iov_base = (char*)&header; msgvec[msg.msg_iovlen].iov_len = sizeof(header); msglen += sizeof(header); msg.msg_iovlen++; } else { memcpy(&oldheader, &header, sizeof(header)); oldheader.src.name = header.src; oldheader.src.addr = connection_state->get_peer_addr(); oldheader.orig_src = oldheader.src; oldheader.reserved = header.reserved; oldheader.crc = ceph_crc32c_le(0, (unsigned char*)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc)); msgvec[msg.msg_iovlen].iov_base = (char*)&oldheader; msgvec[msg.msg_iovlen].iov_len = sizeof(oldheader); msglen += sizeof(oldheader); msg.msg_iovlen++; } // payload (front+data) list::const_iterator pb = blist.buffers().begin(); int b_off = 0; // carry-over buffer offset, if any int bl_pos = 0; // blist pos int left = blist.length(); while (left > 0) { int donow = MIN(left, (int)pb->length()-b_off); if (donow == 0) { ldout(msgr->cct,0) << "donow = " << donow << " left " << left << " pb->length " << pb->length() << " b_off " << b_off << dendl; } assert(donow > 0); ldout(msgr->cct,30) << " bl_pos " << bl_pos << " b_off " << b_off << " leftinchunk " << left << " buffer len " << pb->length() << " writing " << donow << dendl; if (msg.msg_iovlen >= IOV_MAX-2) { if (do_sendmsg(&msg, msglen, true)) goto fail; // and restart the iov msg.msg_iov = msgvec; msg.msg_iovlen = 0; msglen = 0; } msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str()+b_off); msgvec[msg.msg_iovlen].iov_len = donow; msglen += donow; msg.msg_iovlen++; left -= donow; assert(left >= 0); b_off += donow; bl_pos += donow; if (left == 0) break; while (b_off == (int)pb->length()) { pb++; b_off = 0; } } assert(left == 0); // send footer msgvec[msg.msg_iovlen].iov_base = (void*)&footer; msgvec[msg.msg_iovlen].iov_len = sizeof(footer); msglen += sizeof(footer); msg.msg_iovlen++; // send if (do_sendmsg(&msg, msglen)) goto fail; ret = 0; out: delete[] msgvec; return ret; fail: ret = -1; goto out; } int Pipe::tcp_read(char *buf, int len) { if (sd < 0) return -1; while (len > 0) { if (msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) { if (rand() % msgr->cct->_conf->ms_inject_socket_failures == 0) { ldout(msgr->cct, 0) << "injecting socket failure" << dendl; ::shutdown(sd, SHUT_RDWR); } } if (tcp_read_wait() < 0) return -1; int got = tcp_read_nonblocking(buf, len); if (got < 0) return -1; len -= got; buf += got; //lgeneric_dout(cct, DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; } return len; } int Pipe::tcp_read_wait() { if (sd < 0) return -1; struct pollfd pfd; short evmask; pfd.fd = sd; pfd.events = POLLIN; #if defined(__linux__) pfd.events |= POLLRDHUP; #endif if (poll(&pfd, 1, msgr->timeout) <= 0) return -1; evmask = POLLERR | POLLHUP | POLLNVAL; #if defined(__linux__) evmask |= POLLRDHUP; #endif if (pfd.revents & evmask) return -1; if (!(pfd.revents & POLLIN)) return -1; return 0; } int Pipe::tcp_read_nonblocking(char *buf, int len) { again: int got = ::recv( sd, buf, len, MSG_DONTWAIT ); if (got < 0) { if (errno == EAGAIN || errno == EINTR) { goto again; } else { ldout(msgr->cct, 10) << "tcp_read_nonblocking socket " << sd << " returned " << got << " errno " << errno << " " << cpp_strerror(errno) << dendl; return -1; } } else if (got == 0) { /* poll() said there was data, but we didn't read any - peer * sent a FIN. Maybe POLLRDHUP signals this, but this is * standard socket behavior as documented by Stevens. */ return -1; } return got; } int Pipe::tcp_write(const char *buf, int len) { if (sd < 0) return -1; struct pollfd pfd; pfd.fd = sd; pfd.events = POLLOUT | POLLHUP | POLLNVAL | POLLERR; #if defined(__linux__) pfd.events |= POLLRDHUP; #endif if (msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) { if (rand() % msgr->cct->_conf->ms_inject_socket_failures == 0) { ldout(msgr->cct, 0) << "injecting socket failure" << dendl; ::shutdown(sd, SHUT_RDWR); } } if (poll(&pfd, 1, -1) < 0) return -1; if (!(pfd.revents & POLLOUT)) return -1; //lgeneric_dout(cct, DBL) << "tcp_write writing " << len << dendl; assert(len > 0); while (len > 0) { int did = ::send( sd, buf, len, MSG_NOSIGNAL ); if (did < 0) { //lgeneric_dout(cct, 1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; //lgeneric_derr(cct, 1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; return did; } len -= did; buf += did; //lgeneric_dout(cct, DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; } return 0; }