ceph/qa/tasks/rgw.py
Marcus Watts 454cc8a18c Fix vault token file access.
Put the vault token file in a location that ceph can read.
Make it readable only by ceph.

On rhel8 (and indeed, any vanilla rhel machine), $HOME is liable to be
mode 700.  This means the ceph user can't read things in that user's
directory.  This causes radosgw to emit the confusing message "ERROR:
Vault token file ... not found" even though the teuthology log will
plainly show it was created and made readable by ceph.

Fixes: http://tracker.ceph.com/issues/51539
Signed-off-by: Marcus Watts <mwatts@redhat.com>
2021-10-28 14:14:10 -04:00

421 lines
16 KiB
Python

"""
rgw routines
"""
import argparse
import contextlib
import logging
from teuthology.orchestra import run
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.exceptions import ConfigError
from tasks.ceph_manager import get_valgrind_args
from tasks.util import get_remote_for_role
from tasks.util.rgw import rgwadmin, wait_for_radosgw
from tasks.util.rados import (create_ec_pool,
create_replicated_pool,
create_cache_pool)
log = logging.getLogger(__name__)
class RGWEndpoint:
def __init__(self, hostname=None, port=None, cert=None, dns_name=None, website_dns_name=None):
self.hostname = hostname
self.port = port
self.cert = cert
self.dns_name = dns_name
self.website_dns_name = website_dns_name
def url(self):
proto = 'https' if self.cert else 'http'
return '{proto}://{hostname}:{port}/'.format(proto=proto, hostname=self.hostname, port=self.port)
@contextlib.contextmanager
def start_rgw(ctx, config, clients):
"""
Start rgw on remote sites.
"""
log.info('Starting rgw...')
testdir = teuthology.get_testdir(ctx)
for client in clients:
(remote,) = ctx.cluster.only(client).remotes.keys()
cluster_name, daemon_type, client_id = teuthology.split_role(client)
client_with_id = daemon_type + '.' + client_id
client_with_cluster = cluster_name + '.' + client_with_id
client_config = config.get(client)
if client_config is None:
client_config = {}
log.info("rgw %s config is %s", client, client_config)
cmd_prefix = [
'sudo',
'adjust-ulimits',
'ceph-coverage',
'{tdir}/archive/coverage'.format(tdir=testdir),
'daemon-helper',
'term',
]
rgw_cmd = ['radosgw']
log.info("Using %s as radosgw frontend", ctx.rgw.frontend)
endpoint = ctx.rgw.role_endpoints[client]
frontends = ctx.rgw.frontend
frontend_prefix = client_config.get('frontend_prefix', None)
if frontend_prefix:
frontends += ' prefix={pfx}'.format(pfx=frontend_prefix)
if endpoint.cert:
# add the ssl certificate path
frontends += ' ssl_certificate={}'.format(endpoint.cert.certificate)
frontends += ' ssl_port={}'.format(endpoint.port)
else:
frontends += ' port={}'.format(endpoint.port)
rgw_cmd.extend([
'--rgw-frontends', frontends,
'-n', client_with_id,
'--cluster', cluster_name,
'-k', '/etc/ceph/{client_with_cluster}.keyring'.format(client_with_cluster=client_with_cluster),
'--log-file',
'/var/log/ceph/rgw.{client_with_cluster}.log'.format(client_with_cluster=client_with_cluster),
'--rgw_ops_log_socket_path',
'{tdir}/rgw.opslog.{client_with_cluster}.sock'.format(tdir=testdir,
client_with_cluster=client_with_cluster),
])
keystone_role = client_config.get('use-keystone-role', None)
if keystone_role is not None:
if not ctx.keystone:
raise ConfigError('rgw must run after the keystone task')
url = 'http://{host}:{port}/v1/KEY_$(tenant_id)s'.format(host=endpoint.hostname,
port=endpoint.port)
ctx.keystone.create_endpoint(ctx, keystone_role, 'swift', url)
keystone_host, keystone_port = \
ctx.keystone.public_endpoints[keystone_role]
rgw_cmd.extend([
'--rgw_keystone_url',
'http://{khost}:{kport}'.format(khost=keystone_host,
kport=keystone_port),
])
if client_config.get('dns-name') is not None:
rgw_cmd.extend(['--rgw-dns-name', endpoint.dns_name])
if client_config.get('dns-s3website-name') is not None:
rgw_cmd.extend(['--rgw-dns-s3website-name', endpoint.website_dns_name])
vault_role = client_config.get('use-vault-role', None)
barbican_role = client_config.get('use-barbican-role', None)
pykmip_role = client_config.get('use-pykmip-role', None)
token_path = '/etc/ceph/vault-root-token'
if barbican_role is not None:
if not hasattr(ctx, 'barbican'):
raise ConfigError('rgw must run after the barbican task')
barbican_host, barbican_port = \
ctx.barbican.endpoints[barbican_role]
log.info("Use barbican url=%s:%s", barbican_host, barbican_port)
rgw_cmd.extend([
'--rgw_barbican_url',
'http://{bhost}:{bport}'.format(bhost=barbican_host,
bport=barbican_port),
])
elif vault_role is not None:
if not ctx.vault.root_token:
raise ConfigError('vault: no "root_token" specified')
# create token on file
ctx.rgw.vault_role = vault_role
ctx.cluster.only(client).run(args=['sudo', 'echo', '-n', ctx.vault.root_token, run.Raw('|'), 'sudo', 'tee', token_path])
log.info("Token file content")
ctx.cluster.only(client).run(args=['cat', token_path])
log.info("Restrict access to token file")
ctx.cluster.only(client).run(args=['sudo', 'chmod', '600', token_path])
ctx.cluster.only(client).run(args=['sudo', 'chown', 'ceph', token_path])
rgw_cmd.extend([
'--rgw_crypt_vault_addr', "{}:{}".format(*ctx.vault.endpoints[vault_role]),
'--rgw_crypt_vault_token_file', token_path
])
elif pykmip_role is not None:
if not hasattr(ctx, 'pykmip'):
raise ConfigError('rgw must run after the pykmip task')
ctx.rgw.pykmip_role = pykmip_role
rgw_cmd.extend([
'--rgw_crypt_kmip_addr', "{}:{}".format(*ctx.pykmip.endpoints[pykmip_role]),
])
rgw_cmd.extend([
'--foreground',
run.Raw('|'),
'sudo',
'tee',
'/var/log/ceph/rgw.{client_with_cluster}.stdout'.format(client_with_cluster=client_with_cluster),
run.Raw('2>&1'),
])
if client_config.get('valgrind'):
cmd_prefix = get_valgrind_args(
testdir,
client_with_cluster,
cmd_prefix,
client_config.get('valgrind'),
# see https://github.com/ceph/teuthology/pull/1600
exit_on_first_error=False
)
run_cmd = list(cmd_prefix)
run_cmd.extend(rgw_cmd)
ctx.daemons.add_daemon(
remote, 'rgw', client_with_id,
cluster=cluster_name,
fsid=ctx.ceph[cluster_name].fsid,
args=run_cmd,
logger=log.getChild(client),
stdin=run.PIPE,
wait=False,
)
# XXX: add_daemon() doesn't let us wait until radosgw finishes startup
for client in clients:
endpoint = ctx.rgw.role_endpoints[client]
url = endpoint.url()
log.info('Polling {client} until it starts accepting connections on {url}'.format(client=client, url=url))
(remote,) = ctx.cluster.only(client).remotes.keys()
wait_for_radosgw(url, remote)
try:
yield
finally:
for client in clients:
cluster_name, daemon_type, client_id = teuthology.split_role(client)
client_with_id = daemon_type + '.' + client_id
client_with_cluster = cluster_name + '.' + client_with_id
ctx.daemons.get_daemon('rgw', client_with_id, cluster_name).stop()
ctx.cluster.only(client).run(
args=[
'rm',
'-f',
'{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
client=client_with_cluster),
],
)
ctx.cluster.only(client).run(args=['sudo', 'rm', '-f', token_path])
def assign_endpoints(ctx, config, default_cert):
role_endpoints = {}
for role, client_config in config.items():
client_config = client_config or {}
remote = get_remote_for_role(ctx, role)
cert = client_config.get('ssl certificate', default_cert)
if cert:
# find the certificate created by the ssl task
if not hasattr(ctx, 'ssl_certificates'):
raise ConfigError('rgw: no ssl task found for option "ssl certificate"')
ssl_certificate = ctx.ssl_certificates.get(cert, None)
if not ssl_certificate:
raise ConfigError('rgw: missing ssl certificate "{}"'.format(cert))
else:
ssl_certificate = None
port = client_config.get('port', 443 if ssl_certificate else 80)
# if dns-name is given, use it as the hostname (or as a prefix)
dns_name = client_config.get('dns-name', '')
if len(dns_name) == 0 or dns_name.endswith('.'):
dns_name += remote.hostname
website_dns_name = client_config.get('dns-s3website-name')
if website_dns_name is not None and (len(website_dns_name) == 0 or website_dns_name.endswith('.')):
website_dns_name += remote.hostname
role_endpoints[role] = RGWEndpoint(remote.hostname, port, ssl_certificate, dns_name, website_dns_name)
return role_endpoints
@contextlib.contextmanager
def create_pools(ctx, clients):
"""Create replicated or erasure coded data pools for rgw."""
log.info('Creating data pools')
for client in clients:
log.debug("Obtaining remote for client {}".format(client))
(remote,) = ctx.cluster.only(client).remotes.keys()
data_pool = 'default.rgw.buckets.data'
cluster_name, daemon_type, client_id = teuthology.split_role(client)
if ctx.rgw.ec_data_pool:
create_ec_pool(remote, data_pool, client, ctx.rgw.data_pool_pg_size,
ctx.rgw.erasure_code_profile, cluster_name, 'rgw')
else:
create_replicated_pool(remote, data_pool, ctx.rgw.data_pool_pg_size, cluster_name, 'rgw')
index_pool = 'default.rgw.buckets.index'
create_replicated_pool(remote, index_pool, ctx.rgw.index_pool_pg_size, cluster_name, 'rgw')
if ctx.rgw.cache_pools:
create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
64*1024*1024, cluster_name)
log.debug('Pools created')
yield
@contextlib.contextmanager
def configure_compression(ctx, clients, compression):
""" set a compression type in the default zone placement """
log.info('Configuring compression type = %s', compression)
for client in clients:
# XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
# issue a 'radosgw-admin user list' command to trigger this
rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
rgwadmin(ctx, client,
cmd=['zone', 'placement', 'modify', '--rgw-zone', 'default',
'--placement-id', 'default-placement',
'--compression', compression],
check_status=True)
yield
@contextlib.contextmanager
def configure_datacache(ctx, clients, datacache_path):
""" create directory for rgw datacache """
log.info('Preparing directory for rgw datacache at %s', datacache_path)
for client in clients:
if(datacache_path != None):
ctx.cluster.only(client).run(args=['mkdir', '-p', datacache_path])
ctx.cluster.only(client).run(args=['sudo', 'chmod', 'a+rwx', datacache_path])
else:
log.info('path for datacache was not provided')
yield
@contextlib.contextmanager
def configure_storage_classes(ctx, clients, storage_classes):
""" set a compression type in the default zone placement """
sc = [s.strip() for s in storage_classes.split(',')]
for client in clients:
# XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
# issue a 'radosgw-admin user list' command to trigger this
rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
for storage_class in sc:
log.info('Configuring storage class type = %s', storage_class)
rgwadmin(ctx, client,
cmd=['zonegroup', 'placement', 'add',
'--rgw-zone', 'default',
'--placement-id', 'default-placement',
'--storage-class', storage_class],
check_status=True)
rgwadmin(ctx, client,
cmd=['zone', 'placement', 'add',
'--rgw-zone', 'default',
'--placement-id', 'default-placement',
'--storage-class', storage_class,
'--data-pool', 'default.rgw.buckets.data.' + storage_class.lower()],
check_status=True)
yield
@contextlib.contextmanager
def task(ctx, config):
"""
For example, to run rgw on all clients::
tasks:
- ceph:
- rgw:
To only run on certain clients::
tasks:
- ceph:
- rgw: [client.0, client.3]
or
tasks:
- ceph:
- rgw:
client.0:
client.3:
To run radosgw through valgrind:
tasks:
- ceph:
- rgw:
client.0:
valgrind: [--tool=memcheck]
client.3:
valgrind: [--tool=memcheck]
To configure data or index pool pg_size:
overrides:
rgw:
data_pool_pg_size: 256
index_pool_pg_size: 128
"""
if config is None:
config = dict(('client.{id}'.format(id=id_), None)
for id_ in teuthology.all_roles_of_type(
ctx.cluster, 'client'))
elif isinstance(config, list):
config = dict((name, None) for name in config)
clients = config.keys() # http://tracker.ceph.com/issues/20417
overrides = ctx.config.get('overrides', {})
teuthology.deep_merge(config, overrides.get('rgw', {}))
ctx.rgw = argparse.Namespace()
ctx.rgw.ec_data_pool = bool(config.pop('ec-data-pool', False))
ctx.rgw.erasure_code_profile = config.pop('erasure_code_profile', {})
ctx.rgw.cache_pools = bool(config.pop('cache-pools', False))
ctx.rgw.frontend = config.pop('frontend', 'beast')
ctx.rgw.compression_type = config.pop('compression type', None)
ctx.rgw.storage_classes = config.pop('storage classes', None)
default_cert = config.pop('ssl certificate', None)
ctx.rgw.data_pool_pg_size = config.pop('data_pool_pg_size', 64)
ctx.rgw.index_pool_pg_size = config.pop('index_pool_pg_size', 64)
ctx.rgw.datacache = bool(config.pop('datacache', False))
ctx.rgw.datacache_path = config.pop('datacache_path', None)
ctx.rgw.config = config
log.debug("config is {}".format(config))
log.debug("client list is {}".format(clients))
ctx.rgw.role_endpoints = assign_endpoints(ctx, config, default_cert)
subtasks = [
lambda: create_pools(ctx=ctx, clients=clients),
]
if ctx.rgw.compression_type:
subtasks.extend([
lambda: configure_compression(ctx=ctx, clients=clients,
compression=ctx.rgw.compression_type),
])
if ctx.rgw.datacache:
subtasks.extend([
lambda: configure_datacache(ctx=ctx, clients=clients,
datacache_path=ctx.rgw.datacache_path),
])
if ctx.rgw.storage_classes:
subtasks.extend([
lambda: configure_storage_classes(ctx=ctx, clients=clients,
storage_classes=ctx.rgw.storage_classes),
])
subtasks.extend([
lambda: start_rgw(ctx=ctx, config=config, clients=clients),
])
with contextutil.nested(*subtasks):
yield