mirror of
https://github.com/ceph/ceph
synced 2025-01-20 10:01:45 +00:00
ceph-disk-prepare, debian/control: Support external journals.
Previously, ceph-disk-* would only let you use a journal that was a file inside the OSD data directory. With this, you can run `ceph-disk-prepare /dev/sdb /dev/sdb` to put the journal in a second partition on the same disk as the OSD data (which might save some file system overhead), or, more interestingly, `ceph-disk-prepare /dev/sdb /dev/sdc`, which makes it create a new partition on /dev/sdc to use as the journal. The size of the partition is decided by $osd_journal_size. /dev/sdc must be a GPT-format disk. Multiple OSDs may share the same journal disk (using separate partitions); this way, a single fast SSD can serve as the journal for multiple spinning disks. The second use case currently requires parted, so a Recommends: for parted has been added to the Debian packaging. Closes: #3078 Closes: #3079 Signed-off-by: Tommi Virtanen <tv@inktank.com>
This commit is contained in:
parent
4db12511f7
commit
662c69e525
2
debian/control
vendored
2
debian/control
vendored
@ -12,7 +12,7 @@ Standards-Version: 3.9.3
|
|||||||
Package: ceph
|
Package: ceph
|
||||||
Architecture: linux-any
|
Architecture: linux-any
|
||||||
Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs
|
Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs
|
||||||
Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk
|
Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk, parted
|
||||||
Description: distributed storage and file system
|
Description: distributed storage and file system
|
||||||
Ceph is a distributed storage system designed to provide excellent
|
Ceph is a distributed storage system designed to provide excellent
|
||||||
performance, reliability, and scalability.
|
performance, reliability, and scalability.
|
||||||
|
@ -54,6 +54,23 @@ def write_one_line(parent, name, text):
|
|||||||
|
|
||||||
CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
|
CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
|
||||||
|
|
||||||
|
JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
|
||||||
|
|
||||||
|
|
||||||
|
# TODO depend on python2.7
|
||||||
|
def _check_output(*args, **kwargs):
    """
    Run a command, wait for it to exit, and return what it wrote to stdout.

    All arguments are handed to ``subprocess.Popen`` unchanged, except
    that ``stdout`` is always set to a pipe so the output can be
    captured; passing ``stdout`` yourself is therefore an error.

    Returns the captured stdout of the command.

    Raises ``ValueError`` if a ``stdout`` keyword argument is given, and
    ``subprocess.CalledProcessError`` (with the captured stdout in its
    ``output`` attribute) if the command exits with a non-zero status.
    """
    if 'stdout' in kwargs:
        # Popen would otherwise fail with an opaque "multiple values for
        # keyword argument" TypeError; mirror subprocess.check_output
        raise ValueError('stdout argument not allowed, it will be overridden.')

    process = subprocess.Popen(
        stdout=subprocess.PIPE,
        *args, **kwargs)
    out, _ = process.communicate()
    # communicate() has already waited for the child, so the exit
    # status is available without calling wait() again
    ret = process.returncode
    if ret:
        cmd = kwargs.get("args")
        if cmd is None:
            cmd = args[0]
        # set the attribute instead of passing output= to the
        # constructor, which older CalledProcessError versions reject
        error = subprocess.CalledProcessError(ret, cmd)
        error.output = out
        raise error
    return out
|
||||||
|
|
||||||
|
|
||||||
def get_conf(cluster, variable):
|
def get_conf(cluster, variable):
|
||||||
try:
|
try:
|
||||||
@ -86,6 +103,36 @@ def get_conf(cluster, variable):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def get_conf_with_default(cluster, variable):
    """
    Ask ``ceph-osd`` for the value of a configuration variable.

    Runs ``ceph-osd --cluster=<cluster> --show-config-value=<variable>``
    and returns the first line of what it prints.

    Only works for variables that are known to the C++ code, i.e.
    defined in the common config options; asking for anything else
    fails.

    Raises PrepareError if the ``ceph-osd`` command exits with an error.
    """
    command = [
        'ceph-osd',
        '--cluster={cluster}'.format(cluster=cluster),
        '--show-config-value={variable}'.format(variable=variable),
    ]

    try:
        output = _check_output(args=command, close_fds=True)
    except subprocess.CalledProcessError as e:
        raise PrepareError(
            'getting variable from configuration failed',
            e,
        )

    # only the first line of output is the value
    first_line, _, _ = output.partition('\n')
    return first_line
|
||||||
|
|
||||||
|
|
||||||
def get_fsid(cluster):
|
def get_fsid(cluster):
|
||||||
fsid = get_conf(cluster=cluster, variable='fsid')
|
fsid = get_conf(cluster=cluster, variable='fsid')
|
||||||
if fsid is None:
|
if fsid is None:
|
||||||
@ -168,8 +215,48 @@ def unmount(
|
|||||||
os.rmdir(path)
|
os.rmdir(path)
|
||||||
|
|
||||||
|
|
||||||
|
def get_free_partition_index(dev):
    """
    Return the lowest partition number that is not yet used on ``dev``.

    Runs ``parted --machine -- <dev> print`` and parses the result. The
    output is expected to consist of a units line (``CHS;``, ``CYL;`` or
    ``BYT;``), a line describing the disk itself (starting with
    ``/dev/``), and then one line per partition, each beginning with
    ``<number>:``.

    Raises PrepareError if parted fails, prints nothing, or prints
    something that does not match the layout described above.
    """
    try:
        output = _check_output(
            args=[
                'parted',
                '--machine',
                '--',
                dev,
                'print',
            ],
        )
    except subprocess.CalledProcessError as e:
        raise PrepareError('cannot read partition index', e)

    if not output:
        raise PrepareError('parted failed to output anything')
    lines = output.splitlines(True)

    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
        raise PrepareError('weird parted units', lines[0])
    del lines[0]

    # the output may end right after the units line; report that as a
    # parse failure instead of crashing with an IndexError
    if not lines:
        raise PrepareError('weird parted disk entry', '')
    if not lines[0].startswith('/dev/'):
        raise PrepareError('weird parted disk entry', lines[0])
    del lines[0]

    seen = set()
    for line in lines:
        if not line.strip():
            # tolerate blank lines, e.g. at the end of the output
            continue
        number, colon, _ = line.partition(':')
        if not colon:
            raise PrepareError('weird parted partition entry', line)
        try:
            seen.add(int(number))
        except ValueError:
            raise PrepareError('weird parted partition entry', line)

    # partition numbers start at 1; take the first gap
    num = 1
    while num in seen:
        num += 1
    return num
|
||||||
|
|
||||||
|
|
||||||
def prepare(
|
def prepare(
|
||||||
disk,
|
disk,
|
||||||
|
journal,
|
||||||
|
journal_size,
|
||||||
fstype,
|
fstype,
|
||||||
mkfs_args,
|
mkfs_args,
|
||||||
mount_options,
|
mount_options,
|
||||||
@ -184,15 +271,78 @@ def prepare(
|
|||||||
WARNING: This will unconditionally overwrite anything given to
|
WARNING: This will unconditionally overwrite anything given to
|
||||||
it.
|
it.
|
||||||
"""
|
"""
|
||||||
osd_uuid = str(uuid.uuid4())
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# this kills the crab
|
||||||
subprocess.check_call(
|
subprocess.check_call(
|
||||||
args=[
|
args=[
|
||||||
'sgdisk',
|
'sgdisk',
|
||||||
'--zap-all',
|
'--zap-all',
|
||||||
'--clear',
|
'--clear',
|
||||||
'--mbrtogpt',
|
'--mbrtogpt',
|
||||||
|
'--',
|
||||||
|
disk,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
raise PrepareError(e)
|
||||||
|
|
||||||
|
osd_uuid = str(uuid.uuid4())
|
||||||
|
|
||||||
|
# store the partition uuid iff using external journal
|
||||||
|
journal_uuid = None
|
||||||
|
|
||||||
|
if journal is not None:
|
||||||
|
journal_uuid = str(uuid.uuid4())
|
||||||
|
|
||||||
|
if journal == disk:
|
||||||
|
# we're sharing the disk between osd data and journal;
|
||||||
|
# make journal be partition number 2, so it's pretty; put
|
||||||
|
# journal at end of free space so partitioning tools don't
|
||||||
|
# reorder them suddenly
|
||||||
|
num = 2
|
||||||
|
journal_part = '{num}:-{size}M:0'.format(
|
||||||
|
num=num,
|
||||||
|
size=journal_size,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# sgdisk has no way for me to say "whatever is the next
|
||||||
|
# free index number" when setting type guids etc, so we
|
||||||
|
# need to awkwardly look up the next free number, and then
|
||||||
|
# fix that in the call -- and hope nobody races with us;
|
||||||
|
# then again nothing guards the partition table from races
|
||||||
|
# anyway
|
||||||
|
num = get_free_partition_index(dev=journal)
|
||||||
|
journal_part = '{num}:0:{size}M'.format(
|
||||||
|
num=num,
|
||||||
|
size=journal_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
subprocess.check_call(
|
||||||
|
args=[
|
||||||
|
'sgdisk',
|
||||||
|
'--new={part}'.format(part=journal_part),
|
||||||
|
'--change-name={num}:ceph journal'.format(num=num),
|
||||||
|
'--partition-guid={num}:{journal_uuid}'.format(
|
||||||
|
num=num,
|
||||||
|
journal_uuid=journal_uuid,
|
||||||
|
),
|
||||||
|
'--typecode={num}:{uuid}'.format(
|
||||||
|
num=num,
|
||||||
|
uuid=JOURNAL_UUID,
|
||||||
|
),
|
||||||
|
'--',
|
||||||
|
journal,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
raise PrepareError(e)
|
||||||
|
|
||||||
|
try:
|
||||||
|
subprocess.check_call(
|
||||||
|
args=[
|
||||||
|
'sgdisk',
|
||||||
'--largest-new=1',
|
'--largest-new=1',
|
||||||
'--change-name=1:ceph data',
|
'--change-name=1:ceph data',
|
||||||
'--partition-guid=1:{osd_uuid}'.format(
|
'--partition-guid=1:{osd_uuid}'.format(
|
||||||
@ -226,6 +376,14 @@ def prepare(
|
|||||||
|
|
||||||
path = mount(dev=dev, fstype=fstype, options=mount_options)
|
path = mount(dev=dev, fstype=fstype, options=mount_options)
|
||||||
try:
|
try:
|
||||||
|
if journal_uuid is not None:
|
||||||
|
# we're using an external journal; point to it here
|
||||||
|
os.symlink(
|
||||||
|
'/dev/disk/by-partuuid/{journal_uuid}'.format(
|
||||||
|
journal_uuid=journal_uuid,
|
||||||
|
),
|
||||||
|
os.path.join(path, 'journal'),
|
||||||
|
)
|
||||||
write_one_line(path, 'ceph_fsid', cluster_uuid)
|
write_one_line(path, 'ceph_fsid', cluster_uuid)
|
||||||
write_one_line(path, 'fsid', osd_uuid)
|
write_one_line(path, 'fsid', osd_uuid)
|
||||||
write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
|
write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
|
||||||
@ -273,6 +431,13 @@ def parse_args():
|
|||||||
metavar='DISK',
|
metavar='DISK',
|
||||||
help='path to OSD data disk block device',
|
help='path to OSD data disk block device',
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'journal',
|
||||||
|
metavar='JOURNAL',
|
||||||
|
nargs='?',
|
||||||
|
help=('path to OSD journal disk block device;'
|
||||||
|
+ ' leave out to store journal in file'),
|
||||||
|
)
|
||||||
parser.set_defaults(
|
parser.set_defaults(
|
||||||
# we want to hold on to this, for later
|
# we want to hold on to this, for later
|
||||||
prog=parser.prog,
|
prog=parser.prog,
|
||||||
@ -323,8 +488,16 @@ def main():
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
journal_size = get_conf_with_default(
|
||||||
|
cluster=args.cluster,
|
||||||
|
variable='osd_journal_size',
|
||||||
|
)
|
||||||
|
journal_size = int(journal_size)
|
||||||
|
|
||||||
prepare(
|
prepare(
|
||||||
disk=args.disk,
|
disk=args.disk,
|
||||||
|
journal=args.journal,
|
||||||
|
journal_size=journal_size,
|
||||||
fstype=args.fs_type,
|
fstype=args.fs_type,
|
||||||
mkfs_args=mkfs_args,
|
mkfs_args=mkfs_args,
|
||||||
mount_options=mount_options,
|
mount_options=mount_options,
|
||||||
|
Loading…
Reference in New Issue
Block a user