ceph/qa/workunits/rados/test_crash.sh

45 lines
1.4 KiB
Bash
Raw Normal View History

#!/bin/sh
# Crash the ceph-osd daemons with SIGABRT so the rest of this script can
# check how the resulting crash reports are handled.
set -x

# run on a single-node three-OSD cluster

# Everything below depends on the OSDs actually having been aborted, so
# stop right away if killall could not signal any ceph-osd process instead
# of failing much later on an unrelated-looking count check.
sudo killall -ABRT ceph-osd || {
    echo "failed to send SIGABRT to ceph-osd" >&2
    exit 1
}

# give the aborted daemons a moment to finish dying
sleep 5
# The kill caused coredumps. Find them and delete them, carefully, so as
# not to disturb other coredumps: teuthology treats any coredump it sees
# as a test failure, so ours must go while unrelated ones must stay.
# sudo is needed because the core files are root/600.
#
# A core is removed only when gdb reports that it was generated by
# ceph-osd AND that the program was terminated by signal 6 / SIGABRT.
# Paths are read line by line and always quoted so that unusual file
# names cannot be word-split or glob-expanded. Matching is done with
# POSIX "case" patterns rather than the GNU-only "expr match".
find "$TESTDIR/archive/coredump" -type f | while IFS= read -r f; do
    gdb_output=$(echo "quit" | sudo gdb /usr/bin/ceph-osd "$f")

    # skip cores that were not produced by ceph-osd
    case $gdb_output in
    *generated*ceph-osd*) ;;
    *) continue ;;
    esac

    # only delete cores caused by the SIGABRT sent earlier in this script
    case $gdb_output in
    *terminated*"signal 6"* | *terminated*"signal SIGABRT"*)
        # stdin is redirected so rm can never consume the list of
        # remaining paths that the loop is reading from find
        sudo rm -- "$f" </dev/null
        ;;
    esac
done
# Under test the OSD daemons run as root, so the crash files they leave
# behind are unreadable to the unprivileged "ceph" user that ceph-crash
# runs as. Hand the whole crash directory over to ceph:ceph so the files
# look the way they would in a real deployment.
crash_dir=/var/lib/ceph/crash
sudo chown -R ceph:ceph "$crash_dir"

# Restart ceph-crash so that it discovers the crashdumps on startup, then
# wait before anything is checked.
sudo systemctl restart ceph-crash
sleep 30
# There must be 3 crashdumps registered and moved to crash/posted.
# "ceph crash ls" prints a table header, hence 3 + 1 = 4 lines.
# The counts are captured first and compared numerically with quoted
# operands, so an empty or malformed result is a clean test failure.
crash_ls_lines=$(ceph crash ls | wc -l)
[ "$crash_ls_lines" -eq 4 ] || exit 1

posted_meta_count=$(sudo find /var/lib/ceph/crash/posted/ -name meta | wc -l)
[ "$posted_meta_count" -eq 3 ] || exit 1

# there should be a health warning
ceph health detail | grep RECENT_CRASH || exit 1

# Archive all crashes, then wait before re-checking health.
ceph crash archive-all || exit 1
sleep 30

# The warning should be gone! Fail if RECENT_CRASH is mentioned at all.
# (Counting with "grep -c ... | grep 0" would also accept 10, 20, 100, ...)
# A failing "ceph health detail" is treated as a test failure as well,
# instead of being mistaken for "no warning present".
health_detail=$(ceph health detail) || exit 1
if printf '%s\n' "$health_detail" | grep RECENT_CRASH; then
    exit 1
fi