From 7ad8ac63fcd228508d2829fb991f74ba1d1e57e6 Mon Sep 17 00:00:00 2001
From: John Spray <jspray@redhat.com>
Date: Thu, 26 Feb 2015 16:27:13 +0000
Subject: [PATCH] tasks/cephfs: clean up core on deliberate crash

To avoid internal.coredump task synthesizing a failure
during teardown from the core we left behind.

Fixes: #10949
Signed-off-by: John Spray <john.spray@redhat.com>
---
 tasks/mds_journal_repair.py | 54 ++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 7 deletions(-)

diff --git a/tasks/mds_journal_repair.py b/tasks/mds_journal_repair.py
index b61214ff154..62ed0534d89 100644
--- a/tasks/mds_journal_repair.py
+++ b/tasks/mds_journal_repair.py
@@ -6,11 +6,15 @@ Test our tools for recovering the content of damaged journals
 import contextlib
 import json
 import logging
+import os
 from textwrap import dedent
 import time
+from StringIO import StringIO
+import re
 from teuthology.orchestra.run import CommandFailedError
 from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO
 from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests
+from teuthology.orchestra import run
 
 
 log = logging.getLogger(__name__)
@@ -213,16 +217,52 @@ class TestJournalRepair(CephFSTestCase):
 
         # See that the second MDS will crash when it starts and tries to
         # acquire rank 1
-        self.fs.mds_restart(active_mds_names[1])
-        crasher = self.fs.mds_daemons[active_mds_names[1]].proc
-
+        crasher_id = active_mds_names[1]
+        self.fs.mds_restart(crasher_id)
         try:
-            crasher.wait()
+            self.fs.mds_daemons[crasher_id].proc.wait()
         except CommandFailedError as e:
-            log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus))
-            self.fs.mds_daemons[active_mds_names[1]].proc = None
+            log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus))
+            self.fs.mds_daemons[crasher_id].proc = None
+
+            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+            # catch it later and treat it as a failure.
+            p = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+            core_pattern = p.stdout.getvalue().strip()
+            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
+                # We have seen a core_pattern that looks like it's from teuthology's coredump
+                # task, so proceed to clear out the core file
+                log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+                # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
+                # to talk to the mons to get assigned a rank to reach the point of crashing
+                addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr']
+                pid_str = addr.split("/")[1]
+                log.info("Determined crasher PID was {0}".format(pid_str))
+
+                # Substitute PID into core_pattern to get a glob
+                core_glob = core_pattern.replace("%p", pid_str)
+                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+
+                # Verify that we see the expected single coredump matching the expected pattern
+                ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "ls", run.Raw(core_glob)
+                ], stdout=StringIO())
+                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+                log.info("Enumerated cores: {0}".format(cores))
+                self.assertEqual(len(cores), 1)
+
+                log.info("Found core file {0}, deleting it".format(cores[0]))
+
+                self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "rm", "-f", cores[0]
+                ])
+            else:
+                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
         else:
-            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1]))
+            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id))
 
         # Now it's crashed, let the MDSMonitor know that it's not coming back
         self.fs.mds_fail(active_mds_names[1])