mgr/cephadm: fix upgrade to version with agent

Signed-off-by: Adam King <adking@redhat.com>
This commit is contained in:
Adam King 2021-09-13 13:20:30 -04:00
parent 64c2a0326c
commit e2643798a9
4 changed files with 18 additions and 5 deletions

View File

@ -171,7 +171,8 @@ class HostData:
up_to_date = True
else:
# we got old counter value with message, inform agent of new timestamp
self.mgr.agent_helpers._request_agent_acks({host})
if not self.mgr.cache.messaging_agent(host):
self.mgr.agent_helpers._request_agent_acks({host})
self.mgr.log.info(
f'Received old metadata from agent on host {host}. Requested up-to-date metadata.')
@ -206,6 +207,7 @@ class AgentMessageThread(threading.Thread):
def run(self) -> None:
self.mgr.log.info(f'Sending message to agent on host {self.host}')
self.mgr.cache.sending_agent_message[self.host] = True
try:
assert self.mgr.cherrypy_thread
root_cert = self.mgr.cherrypy_thread.ssl_certs.get_root_cert()
@ -232,6 +234,7 @@ class AgentMessageThread(threading.Thread):
ssl_ctx.load_cert_chain(cert_fname, key_fname)
except Exception as e:
self.mgr.log.error(f'Failed to get certs for connecting to agent: {e}')
self.mgr.cache.sending_agent_message[self.host] = False
return
try:
bytes_len: str = str(len(self.data.encode('utf-8')))
@ -242,6 +245,7 @@ class AgentMessageThread(threading.Thread):
bytes_len = '0' + bytes_len
except Exception as e:
self.mgr.log.error(f'Failed to get length of json payload: {e}')
self.mgr.cache.sending_agent_message[self.host] = False
return
for retry_wait in [3, 5]:
try:
@ -262,8 +266,10 @@ class AgentMessageThread(threading.Thread):
except Exception as e:
# if it's not a connection error, something has gone wrong. Give up.
self.mgr.log.error(f'Failed to contact agent on host {self.host}: {e}')
self.mgr.cache.sending_agent_message[self.host] = False
return
self.mgr.log.error(f'Could not connect to agent on host {self.host}')
self.mgr.cache.sending_agent_message[self.host] = False
return
@ -273,7 +279,8 @@ class CephadmAgentHelpers:
def _request_agent_acks(self, hosts: Set[str], increment: bool = False) -> None:
for host in hosts:
self.mgr.cache.metadata_up_to_date[host] = False
if increment:
self.mgr.cache.metadata_up_to_date[host] = False
if host not in self.mgr.cache.agent_counter:
self.mgr.cache.agent_counter[host] = 1
elif increment:

View File

@ -460,6 +460,7 @@ class HostCache():
self.metadata_up_to_date = {} # type: Dict[str, bool]
self.agent_keys = {} # type: Dict[str, str]
self.agent_ports = {} # type: Dict[str, int]
self.sending_agent_message = {} # type: Dict[str, bool]
def load(self):
# type: () -> None
@ -941,6 +942,11 @@ class HostCache():
return True
return False
def messaging_agent(self, host: str) -> bool:
if host not in self.sending_agent_message or not self.sending_agent_message[host]:
return False
return True
def host_metadata_up_to_date(self, host: str) -> bool:
if host not in self.metadata_up_to_date or not self.metadata_up_to_date[host]:
return False

View File

@ -101,8 +101,6 @@ class CephadmServe:
self.mgr._schedule_daemon_action(agent.name(), 'redeploy')
if 'agent' not in self.mgr.spec_store:
self.mgr.agent_helpers._apply_agent()
for host in self.mgr.cache.get_hosts():
self.mgr.cache.metadata_up_to_date[host] = False
else:
if 'agent' in self.mgr.spec_store:
self.mgr.spec_store.rm('agent')

View File

@ -472,7 +472,8 @@ class CephadmUpgrade:
if self.mgr.use_agent and not self.mgr.cache.all_host_metadata_up_to_date():
# need to wait for metadata to come in
self.mgr.agent_helpers._request_agent_acks(
set([h for h in self.mgr.cache.get_hosts() if not self.mgr.cache.host_metadata_up_to_date(h)]))
set([h for h in self.mgr.cache.get_hosts() if
(not self.mgr.cache.host_metadata_up_to_date(h) and h in self.mgr.cache.agent_ports and not self.mgr.cache.messaging_agent(h))]))
return
first = False
@ -688,6 +689,7 @@ class CephadmUpgrade:
'redeploy',
image=target_image if not d_entry[1] else None
)
self.mgr.cache.metadata_up_to_date[d.hostname] = False
except Exception as e:
self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
'severity': 'warning',