Merge PR #60381 into main

* refs/pull/60381/head:
	doc: remove references to `mds_log_major_segment_event_ratio`
	mds: start a new major segment after reaching minor segment threshold
	mds: make parts of mdlog reusable to be used by beacon

Reviewed-by: Anthony D Atri <anthony.datri@gmail.com>
Reviewed-by: Patrick Donnelly <pdonnell@ibm.com>
This commit is contained in:
Patrick Donnelly 2024-11-12 22:14:20 -05:00
commit 9d2b3aaa96
No known key found for this signature in database
GPG Key ID: FA47FD0B0367D313
5 changed files with 44 additions and 34 deletions

View File

@ -141,14 +141,12 @@ The targeted size of a log segment in terms of number of events is controlled b
.. confval:: mds_log_events_per_segment
The frequency of major segments (noted by the journaling of the latest ``ESubtreeMap``) is controlled by:
The number of minor mds log segments since last major segment is controlled by:
.. confval:: mds_log_major_segment_event_ratio
.. confval:: mds_log_minor_segments_per_major_segment
When ``mds_log_events_per_segment * mds_log_major_segment_event_ratio``
non-``ESubtreeMap`` events are logged, the MDS will journal a new
``ESubtreeMap``. This is necessary to allow the journal to shrink in size
during the trimming of expired segments.
This controls how often the MDS trims expired log segments (the higher the
value, the less often the MDS updates the journal expiry position for trimming).
The target maximum number of segments is controlled by:

View File

@ -586,16 +586,6 @@ options:
min: 1
services:
- mds
- name: mds_log_major_segment_event_ratio
type: uint
level: advanced
desc: multiple of mds_log_events_per_segment between major segments
default: 12
services:
- mds
min: 1
see_also:
- mds_log_events_per_segment
# segment size for mds log, default to default file_layout_t
- name: mds_log_segment_size
type: size
@ -1741,3 +1731,12 @@ options:
- mds
flags:
- runtime
- name: mds_log_minor_segments_per_major_segment
type: uint
level: advanced
desc: number of minor segments per major segment.
long_desc: The number of minor mds log segments since last major segment after which a major segment is started/logged.
default: 16
services:
- mds
min: 8

View File

@ -321,16 +321,15 @@ void Beacon::notify_health(MDSRank const *mds)
// Detect MDS_HEALTH_TRIM condition
// Indicates MDS is not trimming promptly
{
const auto log_max_segments = mds->mdlog->get_max_segments();
const auto log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor");
if (mds->mdlog->get_num_segments() > (size_t)(log_max_segments * log_warn_factor)) {
if (mds->mdlog->is_trim_slow()) {
auto num_segments = mds->mdlog->get_num_segments();
auto max_segments = mds->mdlog->get_max_segments();
CachedStackStringStream css;
*css << "Behind on trimming (" << mds->mdlog->get_num_segments()
<< "/" << log_max_segments << ")";
*css << "Behind on trimming (" << num_segments << "/" << max_segments << ")";
MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, css->strv());
m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
m.metadata["max_segments"] = stringify(log_max_segments);
m.metadata["num_segments"] = stringify(num_segments);
m.metadata["max_segments"] = stringify(max_segments);
health.metrics.push_back(m);
}
}

View File

@ -53,11 +53,12 @@ MDLog::MDLog(MDSRank* m)
event_large_threshold = g_conf().get_val<uint64_t>("mds_log_event_large_threshold");
events_per_segment = g_conf().get_val<uint64_t>("mds_log_events_per_segment");
pause = g_conf().get_val<bool>("mds_log_pause");
major_segment_event_ratio = g_conf().get_val<uint64_t>("mds_log_major_segment_event_ratio");
max_segments = g_conf().get_val<uint64_t>("mds_log_max_segments");
max_events = g_conf().get_val<int64_t>("mds_log_max_events");
skip_corrupt_events = g_conf().get_val<bool>("mds_log_skip_corrupt_events");
skip_unbounded_events = g_conf().get_val<bool>("mds_log_skip_unbounded_events");
log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor");
minor_segments_per_major_segment = g_conf().get_val<uint64_t>("mds_log_minor_segments_per_major_segment");
upkeep_thread = std::thread(&MDLog::log_trim_upkeep, this);
}
@ -357,14 +358,15 @@ void MDLog::_submit_entry(LogEvent *le, MDSLogContextBase* c)
ceph_assert(!mds_is_shutting_down);
event_seq++;
events_since_last_major_segment++;
if (auto sb = dynamic_cast<SegmentBoundary*>(le); sb) {
auto ls = _start_new_segment(sb);
if (sb->is_major_segment_boundary()) {
major_segments.insert(ls->seq);
logger->set(l_mdl_segmjr, major_segments.size());
events_since_last_major_segment = 0;
minor_segments_since_last_major_segment = 0;
} else {
++minor_segments_since_last_major_segment;
}
}
@ -403,7 +405,7 @@ void MDLog::_segment_upkeep()
uint64_t period = journaler->get_layout_period();
auto ls = get_current_segment();
// start a new segment?
if (events_since_last_major_segment > events_per_segment*major_segment_event_ratio) {
if (minor_segments_since_last_major_segment > minor_segments_per_major_segment) {
dout(10) << __func__ << ": starting new major segment, current " << *ls << dendl;
auto sle = mds->mdcache->create_subtree_map();
_submit_entry(sle, NULL);
@ -656,6 +658,10 @@ void MDLog::try_to_commit_open_file_table(uint64_t last_seq)
}
}
bool MDLog::is_trim_slow() const {
return (segments.size() > (size_t)(max_segments * log_warn_factor));
}
void MDLog::log_trim_upkeep(void) {
dout(10) << dendl;
@ -1474,7 +1480,6 @@ void MDLog::_replay_thread()
}
le->set_start_off(pos);
events_since_last_major_segment++;
if (auto sb = dynamic_cast<SegmentBoundary*>(le.get()); sb) {
auto seq = sb->get_seq();
if (seq > 0) {
@ -1487,7 +1492,9 @@ void MDLog::_replay_thread()
if (sb->is_major_segment_boundary()) {
major_segments.insert(event_seq);
logger->set(l_mdl_segmjr, major_segments.size());
events_since_last_major_segment = 0;
minor_segments_since_last_major_segment = 0;
} else {
++minor_segments_since_last_major_segment;
}
} else {
event_seq++;
@ -1618,9 +1625,6 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa
if (changed.count("mds_log_events_per_segment")) {
events_per_segment = g_conf().get_val<uint64_t>("mds_log_events_per_segment");
}
if (changed.count("mds_log_major_segment_event_ratio")) {
major_segment_event_ratio = g_conf().get_val<uint64_t>("mds_log_major_segment_event_ratio");
}
if (changed.count("mds_log_max_events")) {
max_events = g_conf().get_val<int64_t>("mds_log_max_events");
}
@ -1642,4 +1646,10 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa
if (changed.count("mds_log_trim_decay_rate")){
log_trim_counter = DecayCounter(g_conf().get_val<double>("mds_log_trim_decay_rate"));
}
if (changed.count("mds_log_warn_factor")) {
log_warn_factor = g_conf().get_val<double>("mds_log_warn_factor");
}
if (changed.count("mds_log_minor_segments_per_major_segment")) {
minor_segments_per_major_segment = g_conf().get_val<uint64_t>("mds_log_minor_segments_per_major_segment");
}
}

View File

@ -173,6 +173,9 @@ public:
// replay state
std::map<inodeno_t, std::set<inodeno_t>> pending_exports;
// beacon needs me too
bool is_trim_slow() const;
protected:
struct PendingEvent {
PendingEvent(LogEvent *e, Context* c, bool f=false) : le(e), fin(c), flush(f) {}
@ -302,9 +305,9 @@ private:
bool debug_subtrees;
std::atomic_uint64_t event_large_threshold; // accessed by submit thread
uint64_t events_per_segment;
uint64_t major_segment_event_ratio;
int64_t max_events;
uint64_t max_segments;
uint64_t minor_segments_per_major_segment;
bool pause;
bool skip_corrupt_events;
bool skip_unbounded_events;
@ -312,7 +315,8 @@ private:
std::set<uint64_t> major_segments;
std::set<LogSegment*> expired_segments;
std::set<LogSegment*> expiring_segments;
uint64_t events_since_last_major_segment = 0;
uint64_t minor_segments_since_last_major_segment = 0;
double log_warn_factor;
// log trimming decay counter
DecayCounter log_trim_counter;