ceph-disk-prepare, debian/control: Support external journals.

Previously, ceph-disk-* would only let you use a journal that was a
file inside the OSD data directory. With this, you can do:

  ceph-disk-prepare /dev/sdb /dev/sdb

to put the journal as a second partition on the same disk as the OSD
data (might save some file system overhead), or, more interestingly:

  ceph-disk-prepare /dev/sdb /dev/sdc

which makes it create a new partition on /dev/sdc to use as the
journal. Size of the partition is decided by $osd_journal_size.
/dev/sdc must be a GPT-format disk. Multiple OSDs may share the same
journal disk (using separate partitions); this way, a single fast SSD
can serve as journal for multiple spinning disks.

The second use case currently requires parted, so a Recommends: for
parted has been added to Debian packaging.

Closes: #3078
Closes: #3079
Signed-off-by: Tommi Virtanen <tv@inktank.com>
This commit is contained in:
Tommi Virtanen 2012-10-05 10:57:42 -07:00
parent 4db12511f7
commit 662c69e525
2 changed files with 175 additions and 2 deletions

2
debian/control vendored
View File

@ -12,7 +12,7 @@ Standards-Version: 3.9.3
Package: ceph Package: ceph
Architecture: linux-any Architecture: linux-any
Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs
Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk, parted
Description: distributed storage and file system Description: distributed storage and file system
Ceph is a distributed storage system designed to provide excellent Ceph is a distributed storage system designed to provide excellent
performance, reliability, and scalability. performance, reliability, and scalability.

View File

@ -54,6 +54,23 @@ def write_one_line(parent, name, text):
CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
# TODO: depend on python2.7 and use subprocess.check_output instead of this backport
def _check_output(*args, **kwargs):
process = subprocess.Popen(
stdout=subprocess.PIPE,
*args, **kwargs)
out, _ = process.communicate()
ret = process.wait()
if ret:
cmd = kwargs.get("args")
if cmd is None:
cmd = args[0]
raise subprocess.CalledProcessError(ret, cmd, output=out)
return out
def get_conf(cluster, variable): def get_conf(cluster, variable):
try: try:
@ -86,6 +103,36 @@ def get_conf(cluster, variable):
return value return value
def get_conf_with_default(cluster, variable):
    """
    Get a config value that is known to the C++ code.

    The value is obtained by running ``ceph-osd`` with
    ``--show-config-value`` for the given cluster, and the first line
    of its output is returned.

    This will fail if called on variables that are not defined in
    common config options; a failing command is reported as a
    PrepareError.
    """
    command = [
        'ceph-osd',
        '--cluster={cluster}'.format(cluster=cluster),
        '--show-config-value={variable}'.format(variable=variable),
    ]
    try:
        out = _check_output(args=command, close_fds=True)
    except subprocess.CalledProcessError as e:
        raise PrepareError(
            'getting variable from configuration failed',
            e,
        )

    # only the first line of the output is the value
    first_line, _, _ = out.partition('\n')
    return first_line
def get_fsid(cluster): def get_fsid(cluster):
fsid = get_conf(cluster=cluster, variable='fsid') fsid = get_conf(cluster=cluster, variable='fsid')
if fsid is None: if fsid is None:
@ -168,8 +215,48 @@ def unmount(
os.rmdir(path) os.rmdir(path)
def get_free_partition_index(dev):
    """
    Return the lowest partition number that is not yet used on ``dev``.

    Runs ``parted --machine -- <dev> print`` and parses the output,
    which is expected to consist of a units line (``CHS;``, ``CYL;``
    or ``BYT;``), a disk line starting with ``/dev/``, and then one
    line per partition whose first colon-separated field is the
    partition number.

    Raises PrepareError if parted fails, prints nothing, or prints
    output that does not have the expected shape.
    """
    try:
        output = _check_output(
            args=[
                'parted',
                '--machine',
                '--',
                dev,
                'print',
            ],
        )
    except subprocess.CalledProcessError as e:
        raise PrepareError('cannot read partition index', e)

    if not output:
        raise PrepareError('parted failed to output anything')
    lines = output.splitlines(True)

    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
        raise PrepareError('weird parted units', lines[0])
    del lines[0]

    # Output that stops after the units line would otherwise surface
    # as a bare IndexError instead of a PrepareError.
    if not lines:
        raise PrepareError('parted output is missing the disk entry')
    if not lines[0].startswith('/dev/'):
        raise PrepareError('weird parted disk entry', lines[0])
    del lines[0]

    seen = set()
    for line in lines:
        if not line.strip():
            # tolerate blank lines in the partition listing
            continue
        idx, sep, _ = line.partition(':')
        if not sep:
            raise PrepareError('weird parted partition entry', line)
        try:
            seen.add(int(idx))
        except ValueError:
            raise PrepareError('weird parted partition entry', line)

    # lowest positive number not already taken by a partition
    num = 1
    while num in seen:
        num += 1
    return num
def prepare( def prepare(
disk, disk,
journal,
journal_size,
fstype, fstype,
mkfs_args, mkfs_args,
mount_options, mount_options,
@ -184,15 +271,78 @@ def prepare(
WARNING: This will unconditionally overwrite anything given to WARNING: This will unconditionally overwrite anything given to
it. it.
""" """
osd_uuid = str(uuid.uuid4())
try: try:
# this kills the crab
subprocess.check_call( subprocess.check_call(
args=[ args=[
'sgdisk', 'sgdisk',
'--zap-all', '--zap-all',
'--clear', '--clear',
'--mbrtogpt', '--mbrtogpt',
'--',
disk,
],
)
except subprocess.CalledProcessError as e:
raise PrepareError(e)
osd_uuid = str(uuid.uuid4())
# store the partition uuid iff using external journal
journal_uuid = None
if journal is not None:
journal_uuid = str(uuid.uuid4())
if journal == disk:
# we're sharing the disk between osd data and journal;
# make journal be partition number 2, so it's pretty; put
# journal at end of free space so partitioning tools don't
# reorder them suddenly
num = 2
journal_part = '{num}:-{size}M:0'.format(
num=num,
size=journal_size,
)
else:
# sgdisk has no way for me to say "whatever is the next
# free index number" when setting type guids etc, so we
# need to awkwardly look up the next free number, and then
# fix that in the call -- and hope nobody races with us;
# then again nothing guards the partition table from races
# anyway
num = get_free_partition_index(dev=journal)
journal_part = '{num}:0:{size}M'.format(
num=num,
size=journal_size,
)
try:
subprocess.check_call(
args=[
'sgdisk',
'--new={part}'.format(part=journal_part),
'--change-name={num}:ceph journal'.format(num=num),
'--partition-guid={num}:{journal_uuid}'.format(
num=num,
journal_uuid=journal_uuid,
),
'--typecode={num}:{uuid}'.format(
num=num,
uuid=JOURNAL_UUID,
),
'--',
journal,
],
)
except subprocess.CalledProcessError as e:
raise PrepareError(e)
try:
subprocess.check_call(
args=[
'sgdisk',
'--largest-new=1', '--largest-new=1',
'--change-name=1:ceph data', '--change-name=1:ceph data',
'--partition-guid=1:{osd_uuid}'.format( '--partition-guid=1:{osd_uuid}'.format(
@ -226,6 +376,14 @@ def prepare(
path = mount(dev=dev, fstype=fstype, options=mount_options) path = mount(dev=dev, fstype=fstype, options=mount_options)
try: try:
if journal_uuid is not None:
# we're using an external journal; point to it here
os.symlink(
'/dev/disk/by-partuuid/{journal_uuid}'.format(
journal_uuid=journal_uuid,
),
os.path.join(path, 'journal'),
)
write_one_line(path, 'ceph_fsid', cluster_uuid) write_one_line(path, 'ceph_fsid', cluster_uuid)
write_one_line(path, 'fsid', osd_uuid) write_one_line(path, 'fsid', osd_uuid)
write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC) write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
@ -273,6 +431,13 @@ def parse_args():
metavar='DISK', metavar='DISK',
help='path to OSD data disk block device', help='path to OSD data disk block device',
) )
parser.add_argument(
'journal',
metavar='JOURNAL',
nargs='?',
help=('path to OSD journal disk block device;'
+ ' leave out to store journal in file'),
)
parser.set_defaults( parser.set_defaults(
# we want to hold on to this, for later # we want to hold on to this, for later
prog=parser.prog, prog=parser.prog,
@ -323,8 +488,16 @@ def main():
), ),
) )
journal_size = get_conf_with_default(
cluster=args.cluster,
variable='osd_journal_size',
)
journal_size = int(journal_size)
prepare( prepare(
disk=args.disk, disk=args.disk,
journal=args.journal,
journal_size=journal_size,
fstype=args.fs_type, fstype=args.fs_type,
mkfs_args=mkfs_args, mkfs_args=mkfs_args,
mount_options=mount_options, mount_options=mount_options,