From 281159bf3b471f42d793ed487f201fc31ed75eb3 Mon Sep 17 00:00:00 2001 From: Kevin Cox Date: Fri, 20 Jun 2014 15:55:37 -0400 Subject: [PATCH] doc: Initial network docs. Signed-Off-By: Kevin Cox --- doc/dev/network-encoding.rst | 214 +++++++++++++++++++++++++++++++++++ doc/dev/network-protocol.rst | 197 ++++++++++++++++++++++++++++++++ 2 files changed, 411 insertions(+) create mode 100644 doc/dev/network-encoding.rst create mode 100644 doc/dev/network-protocol.rst diff --git a/doc/dev/network-encoding.rst b/doc/dev/network-encoding.rst new file mode 100644 index 00000000000..51d030e76a3 --- /dev/null +++ b/doc/dev/network-encoding.rst @@ -0,0 +1,214 @@ +================== + Network Encoding +================== + +This describes the encoding used to serialize data. It doesn't cover specific +objects/messages but focuses on the base types. + +The types are not self-describing in any way. They cannot be decoded unless +you know what they are. + +Conventions +=========== + +Integers +-------- + +The integer types used will be named ``{signed}{size}{endian}``. For example +``u16le`` is an unsigned 16 bit integer encoded in little endian byte order +while ``s64be`` is a signed 64 bit integer in big endian. Additionally ``u8`` +and ``s8`` will represent unsigned and signed bytes respectively. Signed +integers use two's complement encoding. + +Complex Types +------------- + +This document will use a C-like syntax for describing structures. The +structure represents the data that will go over the wire. There will be no +padding between the elements and the elements will be sent in the order they +appear. For example:: + + struct foo { + u8 tag; + u32le data; + } + +When encoded, the values ``0x05`` and ``0x12345678`` respectively will appear on +the wire as ``05 78 56 34 12``. + +Variable Arrays +--------------- + +Unlike C, variable-length arrays can be used anywhere in structures and will be inline in +the protocol. 
Furthermore, the length may be described using an earlier item in +the structure. + +:: + + struct blob { + u32le size; + u8 data[size]; + u32le checksum; + } + +This structure is encoded as a 32 bit size, followed by ``size`` data bytes, +then a 32 bit checksum. + +Primitive Aliases +----------------- + +These types are just aliases for primitive types. + +:: + + // From /src/include/types.h + + typedef u32le epoch_t; + typedef u32le ceph_seq_t; + typedef u64le ceph_tid_t; + typedef u64le version_t; + + +Structures +========== + +These are the way structures are encoded. Note that these structures don't +actually exist in the source but are the way that different types are encoded. + +Optional +-------- + +Optionals are represented as a presence byte, followed by the item if it exists. + +:: + + struct ceph_optional<T> { + u8 present; + T element[present? 1 : 0]; // Only if present is non-zero. + } + +Optionals are used to encode ``boost::optional``. + +Pair +---- + +Pairs are simply the first item followed by the second. + +:: + + struct ceph_pair<A,B> { + A a; + B b; + } + +Pairs are used to encode ``std::pair``. + +Triple +------ + +Triples are simply the three elements one after another. + +:: + + struct ceph_triple<A,B,C> { + A a; + B b; + C c; + } + +Triples are used to encode ``ceph::triple``. + + +List +---- + +Lists are represented as an element count followed by that many elements. + +:: + + struct ceph_list<T> { + u32le length; + T elements[length]; + } + +.. note:: + The sizes of the elements in the list are not necessarily uniform. + +Lists are used to encode ``std::list``, ``std::vector``, ``std::deque``, +``std::set`` and ``ceph::unordered_set``. + +Blob +---- + +A Blob is simply a list of bytes. + +:: + + struct ceph_string { + ceph_list<u8>; + } + + // AKA + + struct ceph_string { + u32le size; + u8 data[size]; + } + +Blobs are used to encode ``std::string``, ``const char *`` and ``bufferlist``. + +.. note:: + The content of a Blob is arbitrary binary data. 
+ +Map +--- + +Maps are a list of pairs. + +:: + + struct ceph_map<K,V> { + ceph_list<ceph_pair<K,V>>; + } + + // AKA + + struct ceph_map<K,V> { + u32le length; + ceph_pair<K,V> entries[length]; + } + +Maps are used to encode ``std::map``, ``std::multimap`` and +``ceph::unordered_map``. + +Complex Types +============= + +These aren't hard to find in the source but the common ones are listed here for +convenience. + +utime_t +------- + +:: + + // From /src/include/utime.h + struct utime_t { + u32le tv_sec; // Seconds since epoch. + u32le tv_nsec; // Nanoseconds since the last second. + } + +ceph_entity_name +---------------- + +:: + + // From /src/include/msgr.h + struct ceph_entity_name { + u8 type; // CEPH_ENTITY_TYPE_* + u64le num; + } + + // CEPH_ENTITY_TYPE_* defined in /src/include/msgr.h + +.. vi: textwidth=80 noexpandtab diff --git a/doc/dev/network-protocol.rst b/doc/dev/network-protocol.rst new file mode 100644 index 00000000000..cb4c6068145 --- /dev/null +++ b/doc/dev/network-protocol.rst @@ -0,0 +1,197 @@ +================== + Network Protocol +================== + +This file describes the network protocol used by Ceph. In order to understand +the way the structures are defined, it is recommended to read the introduction +of :doc:`/dev/network-encoding` first. + +Hello +===== + +The protocol starts with a handshake that confirms that both nodes are talking +ceph and shares some basic information. + +Banner +------ + +The first action is the server sending its banner to the client. The banner is +defined in ``CEPH_BANNER`` from ``src/include/msgr.h``. This is followed by +the server's and then the client's address, each encoded as a ``sockaddr_storage``. + +Once the client verifies that the server's banner matches its own, it replies with +its banner and its address. + +Connect +------- + +Once the banners have been verified and the addresses exchanged, the connection +negotiation begins. First the client sends a ``ceph_msg_connect`` structure +with its information. 
+ +:: + + // From src/include/msgr.h + struct ceph_msg_connect { + u64le features; // Supported features (CEPH_FEATURE_*) + u32le host_type; // CEPH_ENTITY_TYPE_* + u32le global_seq; // Number of connections initiated by this host. + u32le connect_seq; // Number of connections initiated in this session. + u32le protocol_version; + u32le authorizer_protocol; + u32le authorizer_len; + u8 flags; // CEPH_MSG_CONNECT_* + u8 authorizer[authorizer_len]; + } + +Connect Reply +------------- + +Once the connect has been sent, the connection has effectively been opened; +however, the first message the server sends must be a connect reply message. + +:: + + struct ceph_msg_connect_reply { + u8 tag; // Tag indicating response code. + u64le features; + u32le global_seq; + u32le connect_seq; + u32le protocol_version; + u32le authorizer_len; + u8 flags; + u8 authorizer[authorizer_len]; + } + +MSGR Protocol +============= + +This is a low-level protocol over which messages are delivered. The messages +at this level consist of a tag byte, identifying the type of message, followed +by the message data. + +:: + + // Virtual structure. + struct { + u8 tag; // CEPH_MSGR_TAG_* + u8 data[]; // Length depends on tag and data. + } + +The length of ``data`` is determined by the tag byte and, depending on the +message type, by information in the ``data`` array itself. + +.. note:: + There is no way to determine the length of the message if you do not + understand the type of message. + +The message tags are defined in ``src/include/msgr.h`` and the current ones +are listed below along with the data they include. Note that the defined +structures don't exist in the source and are merely for representing the +protocol. + +CEPH_MSGR_TAG_CLOSE (0x06) +-------------------------- + +:: + + struct ceph_msgr_close { + u8 tag = 0x06; + u8 data[0]; // No data. + } + +The close message indicates that the connection is being closed. 
+ +CEPH_MSGR_TAG_MSG (0x07) +------------------------ + +:: + + struct ceph_msgr_msg { + u8 tag = 0x07; + ceph_msg_header header; + u8 front [header.front_len ]; + u8 middle[header.middle_len]; + u8 data [header.data_len ]; + ceph_msg_footer footer; + } + + // From src/include/msgr.h + struct ceph_msg_header { + u64le seq; // Sequence number. + u64le tid; // Transaction ID. + u16le type; // Message type (CEPH_MSG_* or MSG_*). + u16le priority; // Priority (higher is more important). + u16le version; // Version of message encoding. + + u32le front_len; // The size of the front section. + u32le middle_len; // The size of the middle section. + u32le data_len; // The size of the data section. + u16le data_off; // The way data should be aligned by the receiver. + + ceph_entity_name src; // Information about the sender. + + u16le compat_version; // Oldest compatible encoding version. + u16le reserved; // Unused. + u32le crc; // CRC of header. + } + + // From src/include/msgr.h + struct ceph_msg_footer { + u32le front_crc; // Checksums of the various sections. + u32le middle_crc; // + u32le data_crc; // + u64le sig; // Cryptographic signature. + u8 flags; + } + +Messages are the business logic of Ceph. They are what is used to send data and +requests between nodes. The message header contains the length of the message +so unknown messages can be handled gracefully. + +There are two names for the message type constants: ``CEPH_MSG_*`` and ``MSG_*``. +The only difference between the two is that the first is considered "public" +while the second is for internal use only. There is no protocol-level +difference. + +CEPH_MSGR_TAG_ACK (0x08) +------------------------ + +:: + + struct ceph_msgr_ack { + u8 tag = 0x08; + u64le seq; // The sequence number of the message being acknowledged. + } + +CEPH_MSGR_TAG_KEEPALIVE (0x09) +------------------------------ + +:: + + struct ceph_msgr_keepalive { + u8 tag = 0x09; + u8 data[0]; // No data. 
+ } + +CEPH_MSGR_TAG_KEEPALIVE2 (0x0E) +------------------------------- + +:: + + struct ceph_msgr_keepalive2 { + u8 tag = 0x0E; + utime_t timestamp; + } + +CEPH_MSGR_TAG_KEEPALIVE2_ACK (0x0F) +----------------------------------- + +:: + + struct ceph_msgr_keepalive2_ack { + u8 tag = 0x0F; + utime_t timestamp; + } + +.. vi: textwidth=80 noexpandtab