mirror of
https://github.com/ceph/ceph
synced 2025-01-20 10:01:45 +00:00
Merge pull request #27465 from tchaikov/wip-38219
ceph-monstore-tool: use a large enough paxos/{first,last}_committed Reviewed-by: Neha Ojha <nojha@redhat.com>
This commit is contained in:
commit
6f58a26281
@ -441,7 +441,9 @@ deploy at least three (and preferably five) monitors in a Ceph cluster, the chan
|
||||
failure is rare. But unplanned power-downs in a data center with improperly
|
||||
configured disk/fs settings could fail the underlying file system, and hence
|
||||
kill all the monitors. In this case, we can recover the monitor store with the
|
||||
information stored in OSDs.::
|
||||
information stored in OSDs.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ms=/root/mon-store
|
||||
mkdir $ms
|
||||
|
@ -3,6 +3,7 @@ ceph manager -- Thrasher and CephManager objects
|
||||
"""
|
||||
from functools import wraps
|
||||
import contextlib
|
||||
import errno
|
||||
import random
|
||||
import signal
|
||||
import time
|
||||
@ -3091,13 +3092,22 @@ class CephManager:
|
||||
Loop until quorum size is reached.
|
||||
"""
|
||||
self.log('waiting for quorum size %d' % size)
|
||||
start = time.time()
|
||||
while not len(self.get_mon_quorum()) == size:
|
||||
if timeout is not None:
|
||||
assert time.time() - start < timeout, \
|
||||
('failed to reach quorum size %d '
|
||||
'before timeout expired' % size)
|
||||
time.sleep(3)
|
||||
sleep = 3
|
||||
with safe_while(sleep=sleep,
|
||||
tries=timeout // sleep,
|
||||
action=f'wait for quorum size {size}') as proceed:
|
||||
while proceed():
|
||||
try:
|
||||
if len(self.get_mon_quorum()) == size:
|
||||
break
|
||||
except CommandFailedError as e:
|
||||
# could fail instea4d of blocked if the rotating key of the
|
||||
# connected monitor is not updated yet after they form the
|
||||
# quorum
|
||||
if e.exitstatus == errno.EACCES:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
self.log("quorum is size %d" % size)
|
||||
|
||||
def get_mon_health(self, debug=False):
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "auth/KeyRing.h"
|
||||
#include "auth/cephx/CephxKeyServer.h"
|
||||
#include "global/global_init.h"
|
||||
#include "include/scope_guard.h"
|
||||
#include "include/stringify.h"
|
||||
#include "mgr/mgr_commands.h"
|
||||
#include "mon/AuthMonitor.h"
|
||||
@ -640,6 +641,24 @@ static int update_mgrmap(MonitorDBStore& st)
|
||||
|
||||
static int update_paxos(MonitorDBStore& st)
|
||||
{
|
||||
const string prefix("paxos");
|
||||
// a large enough version greater than the maximum possible `last_committed`
|
||||
// that could be replied by the peons when the leader is collecting paxos
|
||||
// transactions during recovery
|
||||
constexpr version_t first_committed = 0x42;
|
||||
constexpr version_t last_committed = first_committed;
|
||||
for (version_t v = first_committed; v < last_committed + 1; v++) {
|
||||
auto t = make_shared<MonitorDBStore::Transaction>();
|
||||
if (v == first_committed) {
|
||||
t->put(prefix, "first_committed", v);
|
||||
}
|
||||
bufferlist proposal;
|
||||
MonitorDBStore::Transaction empty_txn;
|
||||
empty_txn.encode(proposal);
|
||||
t->put(prefix, v, proposal);
|
||||
t->put(prefix, "last_committed", v);
|
||||
st.apply_transaction(t);
|
||||
}
|
||||
// build a pending paxos proposal from all non-permanent k/v pairs. once the
|
||||
// proposal is committed, it will gets applied. on the sync provider side, it
|
||||
// will be a no-op, but on its peers, the paxos commit will help to build up
|
||||
@ -658,11 +677,8 @@ static int update_paxos(MonitorDBStore& st)
|
||||
}
|
||||
t.encode(pending_proposal);
|
||||
}
|
||||
const string prefix("paxos");
|
||||
auto pending_v = last_committed + 1;
|
||||
auto t = make_shared<MonitorDBStore::Transaction>();
|
||||
t->put(prefix, "first_committed", 0);
|
||||
t->put(prefix, "last_committed", 0);
|
||||
auto pending_v = 1;
|
||||
t->put(prefix, pending_v, pending_proposal);
|
||||
t->put(prefix, "pending_v", pending_v);
|
||||
t->put(prefix, "pending_pn", 400);
|
||||
@ -821,6 +837,10 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
auto close_store = make_scope_guard([&] {
|
||||
st.close();
|
||||
});
|
||||
|
||||
if (cmd == "dump-keys") {
|
||||
KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
|
||||
while (iter->valid()) {
|
||||
@ -859,14 +879,12 @@ int main(int argc, char **argv) {
|
||||
int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
|
||||
subcmds, &op_vm);
|
||||
if (r < 0) {
|
||||
err = -r;
|
||||
goto done;
|
||||
return -r;
|
||||
}
|
||||
|
||||
if (op_vm.count("help") || map_type.empty()) {
|
||||
usage(argv[0], op_desc);
|
||||
err = 0;
|
||||
goto done;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (v == 0) {
|
||||
@ -883,17 +901,16 @@ int main(int argc, char **argv) {
|
||||
if (fd < 0) {
|
||||
std::cerr << "error opening output file: "
|
||||
<< cpp_strerror(errno) << std::endl;
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) {
|
||||
auto close_fd = make_scope_guard([&] {
|
||||
::close(fd);
|
||||
if (r < 0 && fd != STDOUT_FILENO) {
|
||||
::remove(outpath.c_str());
|
||||
}
|
||||
} BOOST_SCOPE_EXIT_END
|
||||
});
|
||||
|
||||
bufferlist bl;
|
||||
r = 0;
|
||||
@ -912,8 +929,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
if (r < 0) {
|
||||
std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
if (op_vm.count("readable")) {
|
||||
@ -981,14 +997,12 @@ int main(int argc, char **argv) {
|
||||
int r = parse_cmd_args(&op_desc, NULL, &op_positional,
|
||||
subcmds, &op_vm);
|
||||
if (r < 0) {
|
||||
err = -r;
|
||||
goto done;
|
||||
return -r;
|
||||
}
|
||||
|
||||
if (op_vm.count("help") || map_type.empty()) {
|
||||
usage(argv[0], op_desc);
|
||||
err = 0;
|
||||
goto done;
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned int v_first = 0;
|
||||
@ -1014,22 +1028,19 @@ int main(int argc, char **argv) {
|
||||
int r = parse_cmd_args(&op_desc, NULL, NULL,
|
||||
subcmds, &op_vm);
|
||||
if (r < 0) {
|
||||
err = -r;
|
||||
goto done;
|
||||
return -r;
|
||||
}
|
||||
|
||||
if (op_vm.count("help")) {
|
||||
usage(argv[0], op_desc);
|
||||
err = 0;
|
||||
goto done;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (dstart > dstop) {
|
||||
std::cerr << "error: 'start' version (value: " << dstart << ") "
|
||||
<< " is greater than 'end' version (value: " << dstop << ")"
|
||||
<< std::endl;
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
version_t v = dstart;
|
||||
@ -1077,28 +1088,24 @@ int main(int argc, char **argv) {
|
||||
int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
|
||||
subcmds, &op_vm);
|
||||
if (r < 0) {
|
||||
err = -r;
|
||||
goto done;
|
||||
return -r;
|
||||
}
|
||||
|
||||
if (op_vm.count("help")) {
|
||||
usage(argv[0], op_desc);
|
||||
err = 0;
|
||||
goto done;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (outpath.empty()) {
|
||||
usage(argv[0], op_desc);
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
if (dstart > dstop) {
|
||||
std::cerr << "error: 'start' version (value: " << dstart << ") "
|
||||
<< " is greater than 'stop' version (value: " << dstop << ")"
|
||||
<< std::endl;
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
TraceIter iter(outpath.c_str());
|
||||
@ -1153,20 +1160,17 @@ int main(int argc, char **argv) {
|
||||
po::notify(op_vm);
|
||||
} catch (po::error &e) {
|
||||
std::cerr << "error: " << e.what() << std::endl;
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
if (op_vm.count("help")) {
|
||||
usage(argv[0], op_desc);
|
||||
err = 0;
|
||||
goto done;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (inpath.empty()) {
|
||||
usage(argv[0], op_desc);
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
unsigned num = 0;
|
||||
@ -1206,14 +1210,12 @@ int main(int argc, char **argv) {
|
||||
po::notify(op_vm);
|
||||
} catch (po::error &e) {
|
||||
std::cerr << "error: " << e.what() << std::endl;
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
if (op_vm.count("help")) {
|
||||
usage(argv[0], op_desc);
|
||||
err = 0;
|
||||
goto done;
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned num = 0;
|
||||
@ -1236,8 +1238,7 @@ int main(int argc, char **argv) {
|
||||
} else if (cmd == "store-copy") {
|
||||
if (subcmds.size() < 1 || subcmds[0].empty()) {
|
||||
usage(argv[0], desc);
|
||||
err = EINVAL;
|
||||
goto done;
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
string out_path = subcmds[0];
|
||||
@ -1248,7 +1249,7 @@ int main(int argc, char **argv) {
|
||||
int r = out_store.create_and_open(ss);
|
||||
if (r < 0) {
|
||||
std::cerr << ss.str() << std::endl;
|
||||
goto done;
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1297,10 +1298,6 @@ int main(int argc, char **argv) {
|
||||
} else {
|
||||
std::cerr << "Unrecognized command: " << cmd << std::endl;
|
||||
usage(argv[0], desc);
|
||||
goto done;
|
||||
return err;
|
||||
}
|
||||
|
||||
done:
|
||||
st.close();
|
||||
return err;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user