tools: ceph-objectstore-tool is able to trim pg log dups' entries.

The main assumption is that trimming only the dups does not require any
update to the corresponding pg_info_t.

Testing:

1. cluster without the autoscaler
```
rzarz@ubulap:~/dev/ceph/build$ MON=1 MGR=1 OSD=3 MGR=1 MDS=0 ../src/vstart.sh -l -b -n -o "osd_pg_log_dups_tracked=3000000" -o "osd_pool_default_pg_autoscale_mode=off"
```

2. 8 PGs in the testing pool.
```
rzarz@ubulap:~/dev/ceph/build$ bin/ceph osd pool create test-pool 8 8
```

3. Provisioning dups with rados bench
```
bin/rados bench -p test-pool 300 write -b 4096  --no-cleanup
...
Total time run:         300.034
Total writes made:      103413
Write size:             4096
Object size:            4096
Bandwidth (MB/sec):     1.34637
Stddev Bandwidth:       0.589071
Max bandwidth (MB/sec): 2.4375
Min bandwidth (MB/sec): 0.902344
Average IOPS:           344
Stddev IOPS:            150.802
Max IOPS:               624
Min IOPS:               231
Average Latency(s):     0.0464151
Stddev Latency(s):      0.0183627
Max latency(s):         0.0928424
Min latency(s):         0.0131932
```

4. Killing osd.0
```
rzarz@ubulap:~/dev/ceph/build$ kill 2572129 # pid of osd.0
```

5. Listing the PGs on osd.0 and counting the pg log entries and dups
of each PG:

```
rzarz@ubulap:~/dev/ceph/build$ bin/ceph-objectstore-tool --data-path dev/osd0 --op list-pgs --pgid 2.c > osd0_pgs.txt
rzarz@ubulap:~/dev/ceph/build$ for pgid in `cat osd0_pgs.txt`; do echo $pgid; bin/ceph-objectstore-tool --data-path dev/osd0 --op log --pgid $pgid | jq '(.pg_log_t.log|length),(.pg_log_t.dups|length)'; done
2.7
10020
3100
2.6
10100
3000
2.3
10012
2800
2.1
10049
2900
2.2
10057
2700
2.0
10027
2900
2.5
10077
2700
2.4
10072
2900
1.0
97
0
```

6. Trimming dups
```
rzarz@ubulap:~/dev/ceph/build$ CEPH_ARGS="--osd_pg_log_dups_tracked 2500 --osd_pg_log_trim_max=100" bin/ceph-objectstore-tool --data-path dev/osd0 --op trim-pg-log-dups --pgid 2.7
max_dup_entries=2500 max_chunk_size=100
Removing keys dup_0000000020.00000000000000000001 - dup_0000000020.00000000000000000100
Removing keys dup_0000000020.00000000000000000101 - dup_0000000020.00000000000000000200
Removing keys dup_0000000020.00000000000000000201 - dup_0000000020.00000000000000000300
Removing keys dup_0000000020.00000000000000000301 - dup_0000000020.00000000000000000400
Removing keys dup_0000000020.00000000000000000401 - dup_0000000020.00000000000000000500
Removing keys dup_0000000020.00000000000000000501 - dup_0000000020.00000000000000000600
Finished trimming, now compacting...
Finished trimming pg log dups
```

7. Checking number of pg log's entries and dups
```
rzarz@ubulap:~/dev/ceph/build$ for pgid in `cat osd0_pgs.txt`; do echo $pgid; bin/ceph-objectstore-tool --data-path dev/osd0 --op log --pgid $pgid | jq '(.pg_log_t.log|length),(.pg_log_t.dups|length)'; done
2.7
10020
2500
2.6
10100
3000
2.3
10012
2800
2.1
10049
2900
2.2
10057
2700
2.0
10027
2900
2.5
10077
2700
2.4
10072
2900
1.0
97
0
```

Fixes: https://tracker.ceph.com/issues/53729
Signed-off-by: Radosław Zarzyński <rzarzyns@redhat.com>
This commit is contained in:
Radosław Zarzyński 2022-06-11 21:29:29 +02:00
parent e312733598
commit a2190f901a

View File

@ -736,6 +736,82 @@ int do_trim_pg_log(ObjectStore *store, const coll_t &coll,
return 0;
}
/**
 * Trim the oldest pg log dup entries of a PG.
 *
 * Dup entries are the omap keys of the PG's pgmeta object that start with
 * "dup_". The omap is walked in key order and only the newest
 * osd_pg_log_dups_tracked dup keys are kept; everything older is removed in
 * chunks of at most osd_pg_log_trim_max keys per transaction. Finally the
 * store is compacted.
 *
 * When the file-level dry_run flag is set nothing is modified: the key ranges
 * that would be removed are printed instead, and compaction is skipped.
 *
 * @param store           object store holding the PG
 * @param coll            collection of the PG
 * @param info            unused by this function
 * @param pgid            PG whose pgmeta object is trimmed
 * @param map_epoch       unused by this function
 * @param past_intervals  unused by this function
 * @return 0 on success, or the negative error code returned while queueing
 *         a removal transaction
 */
int do_trim_pg_log_dups(ObjectStore *store, const coll_t &coll,
                        pg_info_t &info, const spg_t &pgid,
                        epoch_t map_epoch,
                        PastIntervals &past_intervals)
{
  ghobject_t oid = pgid.make_pgmeta_oid();
  struct stat st;
  auto ch = store->open_collection(coll);
  int r = store->stat(ch, oid, &st);
  ceph_assert(r == 0);
  ceph_assert(st.st_size == 0);

  const size_t max_dup_entries = g_ceph_context->_conf->osd_pg_log_dups_tracked;
  ceph_assert(max_dup_entries > 0);
  const size_t max_chunk_size = g_ceph_context->_conf->osd_pg_log_trim_max;
  ceph_assert(max_chunk_size > 0);

  cout << "max_dup_entries=" << max_dup_entries
       << " max_chunk_size=" << max_chunk_size << std::endl;
  if (dry_run) {
    cout << "Dry run enabled, no keys will be removed" << std::endl;
  }

  // Sliding window over the newest dup keys seen so far; it is carried across
  // chunks so that a key is only trimmed once max_dup_entries newer ones exist.
  set<string> keys_to_keep;
  // Last key examined by the previous pass. Only used in dry-run mode, where
  // nothing is deleted and restarting from the first key would select the
  // very same chunk over and over again.
  string dry_run_cursor;
  size_t num_removed = 0;
  do {
    set<string> keys_to_trim;
    {
      ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid);
      if (!p)
        break;
      if (dry_run && !dry_run_cursor.empty()) {
        p->upper_bound(dry_run_cursor);
      } else {
        p->seek_to_first();
      }
      for (; p->valid(); p->next()) {
        const string key = p->key();
        // Only dup entries are of interest. Every other pgmeta key ("_*",
        // "can_rollback_to", "missing*", regular log entries, ...) lacks
        // this prefix, so a single check is sufficient.
        if (key.compare(0, 4, "dup_") != 0)
          continue;
        keys_to_keep.insert(key);
        if (keys_to_keep.size() > max_dup_entries) {
          auto oldest_to_keep = keys_to_keep.begin();
          keys_to_trim.emplace(*oldest_to_keep);
          keys_to_keep.erase(oldest_to_keep);
        }
        if (keys_to_trim.size() >= max_chunk_size) {
          dry_run_cursor = key;
          break;
        }
      }
    } // destroy the ObjectMapIterator before modifying the omap

    // delete (or, in dry-run mode, just report) the keys of this chunk
    num_removed = keys_to_trim.size();
    if (!keys_to_trim.empty()) {
      cout << (dry_run ? "Would remove keys " : "Removing keys ")
           << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin()
           << std::endl;
      if (!dry_run) {
        ObjectStore::Transaction t;
        t.omap_rmkeys(coll, oid, keys_to_trim);
        r = store->queue_transaction(ch, std::move(t));
        if (r < 0) {
          // the caller reports the error
          return r;
        }
        ch->flush();
      }
    }
    // A full chunk means there may be more to trim; a short one means the
    // whole omap has been walked.
  } while (num_removed == max_chunk_size);

  // compact the db since we just removed a bunch of data
  cerr << "Finished trimming, now compacting..." << std::endl;
  if (!dry_run)
    store->compact();
  return 0;
}
const int OMAP_BATCH_SIZE = 25;
void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist> &oset)
{
@ -3219,12 +3295,12 @@ int main(int argc, char **argv)
("journal-path", po::value<string>(&jpath),
"path to journal, use if tool can't find it")
("pgid", po::value<string>(&pgidstr),
"PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, and mandatory for apply-layout-settings if --pool is not specified")
"PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, trim-pg-log-dups and mandatory for apply-layout-settings if --pool is not specified")
("pool", po::value<string>(&pool),
"Pool name, mandatory for apply-layout-settings if --pgid is not specified")
("op", po::value<string>(&op),
"Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, list-slow-omap, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
"get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, statfs]")
"get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, trim-pg-log-dups statfs]")
("epoch", po::value<unsigned>(&epoch),
"epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
("file", po::value<string>(&file),
@ -3793,7 +3869,8 @@ int main(int argc, char **argv)
if ((op == "info" || op == "log" || op == "remove" || op == "export"
|| op == "export-remove" || op == "mark-complete"
|| op == "reset-last-complete"
|| op == "trim-pg-log") &&
|| op == "trim-pg-log"
|| op == "trim-pg-log-dups") &&
pgidstr.length() == 0) {
cerr << "Must provide pgid" << std::endl;
usage(desc);
@ -4020,9 +4097,9 @@ int main(int argc, char **argv)
// If not an object command nor any of the ops handled below, then output this usage
// before complaining about a bad pgid
if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log") {
if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log" && op != "trim-pg-log-dups") {
cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
"get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, statfs)"
"get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, trim-pg-log-dups statfs)"
<< std::endl;
usage(desc);
ret = 1;
@ -4375,6 +4452,15 @@ int main(int argc, char **argv)
}
cout << "Finished trimming pg log" << std::endl;
goto out;
} else if (op == "trim-pg-log-dups") {
ret = do_trim_pg_log_dups(fs.get(), coll, info, pgid,
map_epoch, past_intervals);
if (ret < 0) {
cerr << "Error trimming pg log dups: " << cpp_strerror(ret) << std::endl;
goto out;
}
cout << "Finished trimming pg log dups" << std::endl;
goto out;
} else if (op == "reset-last-complete") {
if (!force) {
std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost "