diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index 694dd1cff31..b99e7e9bdc4 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -171,7 +171,8 @@ class HostData: up_to_date = True else: # we got old counter value with message, inform agent of new timestamp - self.mgr.agent_helpers._request_agent_acks({host}) + if not self.mgr.cache.messaging_agent(host): + self.mgr.agent_helpers._request_agent_acks({host}) self.mgr.log.info( f'Received old metadata from agent on host {host}. Requested up-to-date metadata.') @@ -206,6 +207,7 @@ class AgentMessageThread(threading.Thread): def run(self) -> None: self.mgr.log.info(f'Sending message to agent on host {self.host}') + self.mgr.cache.sending_agent_message[self.host] = True try: assert self.mgr.cherrypy_thread root_cert = self.mgr.cherrypy_thread.ssl_certs.get_root_cert() @@ -232,6 +234,7 @@ class AgentMessageThread(threading.Thread): ssl_ctx.load_cert_chain(cert_fname, key_fname) except Exception as e: self.mgr.log.error(f'Failed to get certs for connecting to agent: {e}') + self.mgr.cache.sending_agent_message[self.host] = False return try: bytes_len: str = str(len(self.data.encode('utf-8'))) @@ -242,6 +245,7 @@ class AgentMessageThread(threading.Thread): bytes_len = '0' + bytes_len except Exception as e: self.mgr.log.error(f'Failed to get length of json payload: {e}') + self.mgr.cache.sending_agent_message[self.host] = False return for retry_wait in [3, 5]: try: @@ -262,8 +266,10 @@ class AgentMessageThread(threading.Thread): except Exception as e: # if it's not a connection error, something has gone wrong. Give up. self.mgr.log.error(f'Failed to contact agent on host {self.host}: {e}') + self.mgr.cache.sending_agent_message[self.host] = False return self.mgr.log.error(f'Could not connect to agent on host {self.host}') + self.mgr.cache.sending_agent_message[self.host] = False return @@ -273,7 +279,8 @@ class CephadmAgentHelpers: def _request_agent_acks(self, hosts: Set[str], increment: bool = False) -> None: for host in hosts: - self.mgr.cache.metadata_up_to_date[host] = False + if increment: + self.mgr.cache.metadata_up_to_date[host] = False if host not in self.mgr.cache.agent_counter: self.mgr.cache.agent_counter[host] = 1 elif increment: diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index fd41ebb2c4f..a637f4ae0b9 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -460,6 +460,7 @@ class HostCache(): self.metadata_up_to_date = {} # type: Dict[str, bool] self.agent_keys = {} # type: Dict[str, str] self.agent_ports = {} # type: Dict[str, int] + self.sending_agent_message = {} # type: Dict[str, bool] def load(self): # type: () -> None @@ -941,6 +942,11 @@ class HostCache(): return True return False + def messaging_agent(self, host: str) -> bool: + if host not in self.sending_agent_message or not self.sending_agent_message[host]: + return False + return True + def host_metadata_up_to_date(self, host: str) -> bool: if host not in self.metadata_up_to_date or not self.metadata_up_to_date[host]: return False diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 32787a2c834..49d254e6b4f 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -101,8 +101,6 @@ class CephadmServe: self.mgr._schedule_daemon_action(agent.name(), 'redeploy') if 'agent' not in self.mgr.spec_store: self.mgr.agent_helpers._apply_agent() - for host in self.mgr.cache.get_hosts(): - self.mgr.cache.metadata_up_to_date[host] = False else: if 'agent' in self.mgr.spec_store: self.mgr.spec_store.rm('agent') diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 7fc2596d2fc..d7162941985 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -472,7 +472,8 @@ class CephadmUpgrade: if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date(): # need to wait for metadata to come in self.mgr.agent_helpers._request_agent_acks( - set([h for h in self.mgr.cache.get_hosts() if not self.mgr.cache.host_metadata_up_to_date(h)])) + set([h for h in self.mgr.cache.get_hosts() if + (not self.mgr.cache.host_metadata_up_to_date(h) and h in self.mgr.cache.agent_ports and not self.mgr.cache.messaging_agent(h))])) return first = False @@ -688,6 +689,7 @@ class CephadmUpgrade: 'redeploy', image=target_image if not d_entry[1] else None ) + self.mgr.cache.metadata_up_to_date[d.hostname] = False except Exception as e: self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', { 'severity': 'warning',