Merge pull request #41397 from vshankar/wip-48619

cephfs-top: display average read/write/metadata latency

Reviewed-by: Venky Shankar <vshankar@redhat.com>
This commit is contained in:
Venky Shankar 2022-03-07 10:34:15 +05:30 committed by GitHub
commit 84835e6b71
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 452 additions and 49 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -29,7 +29,7 @@ metrics are for a particular MDS rank (e.g., number of subtrees handled by an MD
Once enabled, Ceph Filesystem metrics can be fetched via::
$ ceph fs perf stats
{"version": 1, "global_counters": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease"], "counters": [], "client_metadata": {"client.614146": {"IP": "10.1.1.100", "hostname" : "ceph-host1", "root": "/", "mount_point": "/mnt/cephfs", "valid_metrics": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease"]}}, "global_metrics": {"client.614146": [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]}, "metrics": {"delayed_ranks": [], "mds.0": {"client.614146": []}}}
{"version": 1, "global_counters": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease", "opened_files", "pinned_icaps", "opened_inodes", "avg_read_latency", "stdev_read_latency", "avg_write_latency", "stdev_write_latency", "avg_metadata_latency", "stdev_metadata_latency"], "counters": [], "client_metadata": {"client.324130": {"IP": "192.168.1.100", "hostname": "ceph-host1", "root": "/", "mount_point": "/mnt/cephfs", "valid_metrics": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease", "opened_files", "pinned_icaps", "opened_inodes", "avg_read_latency", "stdev_read_latency", "avg_write_latency", "stdev_write_latency", "avg_metadata_latency", "stdev_metadata_latency"]}}, "global_metrics": {"client.324130": [[309785, 1280], [0, 0], [197, 519015022], [88, 279074768], [12, 70147], [0, 3], [3, 3], [0, 3], [0, 0], [0, 0], [0, 11699223], [0, 88245], [0, 6596951], [0, 9539]]}, "metrics": {"delayed_ranks": [], "mds.0": {"client.324130": []}}}
Details of the JSON command output are as follows:

View File

@ -43,18 +43,6 @@ Descriptions of fields
cap hit rate
.. describe:: rlat
read latency
.. describe:: wlat
write latency
.. describe:: mlat
metadata latency
.. describe:: dlease
dentry lease rate
@ -95,6 +83,29 @@ Descriptions of fields
speed of write IOs compared with the last refresh
.. describe:: rlatavg
average read latency
.. describe:: rlatsd
standard deviation of read latency
.. describe:: wlatavg
average write latency
.. describe:: wlatsd
standard deviation of write latency
.. describe:: mlatavg
average metadata latency
.. describe:: mlatsd
standard deviation of metadata latency
Availability
============

View File

@ -176,6 +176,30 @@ bool Client::is_reserved_vino(vinodeno_t &vino) {
return false;
}
// Incremental (running) mean, after Donald Knuth's TAoCP, Volume II:
// the first sample becomes the mean; each later sample nudges it by
// (value - mean) / count.
double calc_average(double old_avg, double value, uint64_t count) {
  if (count == 1) {
    return value;
  }
  return old_avg + (value - old_avg) / count;
}
// Welford-style running sum of squared deviations (numerator of the
// sample variance). A single sample has zero spread by definition.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  return (count == 1)
    ? 0.0
    : old_sq_sum + (value - old_mean) * (value - new_mean);
}
// -------------
@ -583,6 +607,16 @@ void Client::_finish_init()
plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
// average, standard deviation mds/r/w/ latencies
plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations");
logger.reset(plb.create_perf_counters());
cct->get_perfcounters_collection()->add(logger.get());
}
@ -708,6 +742,63 @@ void Client::shutdown()
}
}
// Fold one metadata request's latency into the running average and
// sum-of-squares perf counters. nr_metadata_request must already have
// been incremented for this sample (the caller does ++ before calling).
void Client::update_io_stat_metadata(utime_t latency) {
  auto sample_ns = latency.to_nsec();
  // previous aggregates seed the incremental update
  auto prev_avg_ns = logger->tget(l_c_md_avg).to_nsec();
  auto prev_sqsum = logger->get(l_c_md_sqsum);
  auto next_avg_ns = calc_average(prev_avg_ns, sample_ns, nr_metadata_request);
  auto next_sqsum = calc_sq_sum(prev_sqsum, prev_avg_ns, next_avg_ns,
                                sample_ns, nr_metadata_request);
  logger->tinc(l_c_lat, latency);
  logger->tinc(l_c_reply, latency);
  utime_t next_avg;
  next_avg.set_from_double(next_avg_ns / 1000000000);  // nsec -> sec
  logger->tset(l_c_md_avg, next_avg);
  logger->set(l_c_md_sqsum, next_sqsum);
  logger->set(l_c_md_ops, nr_metadata_request);
}
// Fold one read IO's latency into the running average and sum-of-squares
// perf counters. nr_read_request must already count this sample.
void Client::update_io_stat_read(utime_t latency) {
  auto sample_ns = latency.to_nsec();
  // previous aggregates seed the incremental update
  auto prev_avg_ns = logger->tget(l_c_rd_avg).to_nsec();
  auto prev_sqsum = logger->get(l_c_rd_sqsum);
  auto next_avg_ns = calc_average(prev_avg_ns, sample_ns, nr_read_request);
  auto next_sqsum = calc_sq_sum(prev_sqsum, prev_avg_ns, next_avg_ns,
                                sample_ns, nr_read_request);
  logger->tinc(l_c_read, latency);
  utime_t next_avg;
  next_avg.set_from_double(next_avg_ns / 1000000000);  // nsec -> sec
  logger->tset(l_c_rd_avg, next_avg);
  logger->set(l_c_rd_sqsum, next_sqsum);
  logger->set(l_c_rd_ops, nr_read_request);
}
// Fold one write IO's latency into the running average and sum-of-squares
// perf counters. nr_write_request must already count this sample.
void Client::update_io_stat_write(utime_t latency) {
  auto sample_ns = latency.to_nsec();
  // previous aggregates seed the incremental update
  auto prev_avg_ns = logger->tget(l_c_wr_avg).to_nsec();
  auto prev_sqsum = logger->get(l_c_wr_sqsum);
  auto next_avg_ns = calc_average(prev_avg_ns, sample_ns, nr_write_request);
  auto next_sqsum = calc_sq_sum(prev_sqsum, prev_avg_ns, next_avg_ns,
                                sample_ns, nr_write_request);
  logger->tinc(l_c_wrlat, latency);
  utime_t next_avg;
  next_avg.set_from_double(next_avg_ns / 1000000000);  // nsec -> sec
  logger->tset(l_c_wr_avg, next_avg);
  logger->set(l_c_wr_sqsum, next_sqsum);
  logger->set(l_c_wr_ops, nr_write_request);
}
// ===================
// metadata cache stuff
@ -1915,8 +2006,9 @@ int Client::make_request(MetaRequest *request,
utime_t lat = ceph_clock_now();
lat -= request->sent_stamp;
ldout(cct, 20) << "lat " << lat << dendl;
logger->tinc(l_c_lat, lat);
logger->tinc(l_c_reply, lat);
++nr_metadata_request;
update_io_stat_metadata(lat);
put_request(request);
return r;
@ -6675,15 +6767,24 @@ void Client::collect_and_send_global_metrics() {
std::vector<ClientMetricMessage> message;
// read latency
metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
logger->tget(l_c_rd_avg),
logger->get(l_c_rd_sqsum),
nr_read_request));
message.push_back(metric);
// write latency
metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
logger->tget(l_c_wr_avg),
logger->get(l_c_wr_sqsum),
nr_write_request));
message.push_back(metric);
// metadata latency
metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
logger->tget(l_c_md_avg),
logger->get(l_c_md_sqsum),
nr_metadata_request));
message.push_back(metric);
// cap hit ratio -- nr_caps is unused right now
@ -10132,7 +10233,9 @@ success:
lat = ceph_clock_now();
lat -= start;
logger->tinc(l_c_read, lat);
++nr_read_request;
update_io_stat_read(lat);
done:
// done!
@ -10592,7 +10695,9 @@ success:
// time
lat = ceph_clock_now();
lat -= start;
logger->tinc(l_c_wrlat, lat);
++nr_write_request;
update_io_stat_write(lat);
if (fpos) {
lock_fh_pos(f);

View File

@ -77,6 +77,15 @@ enum {
l_c_wrlat,
l_c_read,
l_c_fsync,
l_c_md_avg,
l_c_md_sqsum,
l_c_md_ops,
l_c_rd_avg,
l_c_rd_sqsum,
l_c_rd_ops,
l_c_wr_avg,
l_c_wr_sqsum,
l_c_wr_ops,
l_c_last,
};
@ -1465,6 +1474,10 @@ private:
void collect_and_send_metrics();
void collect_and_send_global_metrics();
void update_io_stat_metadata(utime_t latency);
void update_io_stat_read(utime_t latency);
void update_io_stat_write(utime_t latency);
uint32_t deleg_timeout = 0;
client_switch_interrupt_callback_t switch_interrupt_cb = nullptr;
@ -1582,6 +1595,10 @@ private:
ceph::spinlock delay_i_lock;
std::map<Inode*,int> delay_i_release;
uint64_t nr_metadata_request = 0;
uint64_t nr_read_request = 0;
uint64_t nr_write_request = 0;
};
/**

View File

@ -27,6 +27,12 @@ enum ClientMetricType {
CLIENT_METRIC_TYPE_OPENED_INODES,
CLIENT_METRIC_TYPE_READ_IO_SIZES,
CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
};
inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type) {
switch(type) {
@ -60,6 +66,24 @@ inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type)
case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES:
os << "WRITE_IO_SIZES";
break;
case ClientMetricType::CLIENT_METRIC_TYPE_AVG_READ_LATENCY:
os << "AVG_READ_LATENCY";
break;
case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_READ_LATENCY:
os << "STDEV_READ_LATENCY";
break;
case ClientMetricType::CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY:
os << "AVG_WRITE_LATENCY";
break;
case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY:
os << "STDEV_WRITE_LATENCY";
break;
case ClientMetricType::CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY:
os << "AVG_METADATA_LATENCY";
break;
case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY:
os << "STDEV_METADATA_LATENCY";
break;
default:
os << "(UNKNOWN:" << static_cast<std::underlying_type<ClientMetricType>::type>(type) << ")";
break;
@ -128,97 +152,154 @@ struct CapInfoPayload : public ClientMetricPayloadBase {
struct ReadLatencyPayload : public ClientMetricPayloadBase {
utime_t lat;
utime_t mean;
uint64_t sq_sum; // sum of squares
uint64_t count; // IO count
ReadLatencyPayload()
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY) { }
ReadLatencyPayload(utime_t lat)
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY), lat(lat) {
ReadLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY),
lat(lat),
mean(mean),
sq_sum(sq_sum),
count(count) {
}
void encode(bufferlist &bl) const {
using ceph::encode;
ENCODE_START(1, 1, bl);
ENCODE_START(2, 1, bl);
encode(lat, bl);
encode(mean, bl);
encode(sq_sum, bl);
encode(count, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::const_iterator &iter) {
using ceph::decode;
DECODE_START(1, iter);
DECODE_START(2, iter);
decode(lat, iter);
if (struct_v >= 2) {
decode(mean, iter);
decode(sq_sum, iter);
decode(count, iter);
}
DECODE_FINISH(iter);
}
void dump(Formatter *f) const {
f->dump_int("latency", lat);
f->dump_int("avg_latency", mean);
f->dump_unsigned("sq_sum", sq_sum);
f->dump_unsigned("count", count);
}
void print(std::ostream *out) const {
*out << "latency: " << lat;
*out << "latency: " << lat << ", avg_latency: " << mean
<< ", sq_sum: " << sq_sum << ", count=" << count;
}
};
struct WriteLatencyPayload : public ClientMetricPayloadBase {
utime_t lat;
utime_t mean;
uint64_t sq_sum; // sum of squares
uint64_t count; // IO count
WriteLatencyPayload()
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY) { }
WriteLatencyPayload(utime_t lat)
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY), lat(lat) {
WriteLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY),
lat(lat),
mean(mean),
sq_sum(sq_sum),
count(count){
}
void encode(bufferlist &bl) const {
using ceph::encode;
ENCODE_START(1, 1, bl);
ENCODE_START(2, 1, bl);
encode(lat, bl);
encode(mean, bl);
encode(sq_sum, bl);
encode(count, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::const_iterator &iter) {
using ceph::decode;
DECODE_START(1, iter);
DECODE_START(2, iter);
decode(lat, iter);
if (struct_v >= 2) {
decode(mean, iter);
decode(sq_sum, iter);
decode(count, iter);
}
DECODE_FINISH(iter);
}
void dump(Formatter *f) const {
f->dump_int("latency", lat);
f->dump_int("avg_latency", mean);
f->dump_unsigned("sq_sum", sq_sum);
f->dump_unsigned("count", count);
}
void print(std::ostream *out) const {
*out << "latency: " << lat;
*out << "latency: " << lat << ", avg_latency: " << mean
<< ", sq_sum: " << sq_sum << ", count=" << count;
}
};
struct MetadataLatencyPayload : public ClientMetricPayloadBase {
utime_t lat;
utime_t mean;
uint64_t sq_sum; // sum of squares
uint64_t count; // IO count
MetadataLatencyPayload()
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { }
MetadataLatencyPayload(utime_t lat)
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY), lat(lat) {
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { }
MetadataLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count)
: ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY),
lat(lat),
mean(mean),
sq_sum(sq_sum),
count(count) {
}
void encode(bufferlist &bl) const {
using ceph::encode;
ENCODE_START(1, 1, bl);
ENCODE_START(2, 1, bl);
encode(lat, bl);
encode(mean, bl);
encode(sq_sum, bl);
encode(count, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::const_iterator &iter) {
using ceph::decode;
DECODE_START(1, iter);
DECODE_START(2, iter);
decode(lat, iter);
if (struct_v >= 2) {
decode(mean, iter);
decode(sq_sum, iter);
decode(count, iter);
}
DECODE_FINISH(iter);
}
void dump(Formatter *f) const {
f->dump_int("latency", lat);
f->dump_int("avg_latency", mean);
f->dump_unsigned("sq_sum", sq_sum);
f->dump_unsigned("count", count);
}
void print(std::ostream *out) const {
*out << "latency: " << lat;
*out << "latency: " << lat << ", avg_latency: " << mean
<< ", sq_sum: " << sq_sum << ", count=" << count;
}
};

View File

@ -39,66 +39,102 @@ struct CapHitMetric {
// MDS-side aggregated read-latency metric; v3 adds mean/sq_sum/count so
// stdev can be derived. Fixes: reconstructed the post-change (v3)
// definition from the interleaved diff lines, zero-initialized sq_sum and
// count (previously indeterminate before decode), and corrected the dump
// label typo "avg_read_alatency" -> "avg_read_latency".
struct ReadLatencyMetric {
  utime_t lat;
  utime_t mean;
  uint64_t sq_sum = 0;
  uint64_t count = 0;
  bool updated = false;

  DENC(ReadLatencyMetric, v, p) {
    DENC_START(3, 1, p);
    denc(v.lat, p);
    if (struct_v >= 2)
      denc(v.updated, p);
    if (struct_v >= 3) {
      denc(v.mean, p);
      denc(v.sq_sum, p);
      denc(v.count, p);
    }
    DENC_FINISH(p);
  }

  void dump(Formatter *f) const {
    f->dump_object("read_latency", lat);
    f->dump_object("avg_read_latency", mean);
    f->dump_unsigned("sq_sum", sq_sum);
    f->dump_unsigned("count", count);
  }

  friend std::ostream& operator<<(std::ostream& os, const ReadLatencyMetric &metric) {
    os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean
       << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}";
    return os;
  }
};
// MDS-side aggregated write-latency metric; v3 adds mean/sq_sum/count so
// stdev can be derived. Fixes: reconstructed the post-change (v3)
// definition from the interleaved diff lines, zero-initialized sq_sum and
// count, and corrected the dump label typo "avg_write_alatency".
struct WriteLatencyMetric {
  utime_t lat;
  utime_t mean;
  uint64_t sq_sum = 0;
  uint64_t count = 0;
  bool updated = false;

  DENC(WriteLatencyMetric, v, p) {
    DENC_START(3, 1, p);
    denc(v.lat, p);
    if (struct_v >= 2)
      denc(v.updated, p);
    if (struct_v >= 3) {
      denc(v.mean, p);
      denc(v.sq_sum, p);
      denc(v.count, p);
    }
    DENC_FINISH(p);
  }

  void dump(Formatter *f) const {
    f->dump_object("write_latency", lat);
    f->dump_object("avg_write_latency", mean);
    f->dump_unsigned("sq_sum", sq_sum);
    f->dump_unsigned("count", count);
  }

  friend std::ostream& operator<<(std::ostream& os, const WriteLatencyMetric &metric) {
    os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean
       << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}";
    return os;
  }
};
// MDS-side aggregated metadata-latency metric; v3 adds mean/sq_sum/count
// so stdev can be derived. Fixes: reconstructed the post-change (v3)
// definition from the interleaved diff lines, zero-initialized sq_sum and
// count, and corrected the dump label typo "avg_metadata_alatency".
struct MetadataLatencyMetric {
  utime_t lat;
  utime_t mean;
  uint64_t sq_sum = 0;
  uint64_t count = 0;
  bool updated = false;

  DENC(MetadataLatencyMetric, v, p) {
    DENC_START(3, 1, p);
    denc(v.lat, p);
    if (struct_v >= 2)
      denc(v.updated, p);
    if (struct_v >= 3) {
      denc(v.mean, p);
      denc(v.sq_sum, p);
      denc(v.count, p);
    }
    DENC_FINISH(p);
  }

  void dump(Formatter *f) const {
    f->dump_object("metadata_latency", lat);
    f->dump_object("avg_metadata_latency", mean);
    f->dump_unsigned("sq_sum", sq_sum);
    f->dump_unsigned("count", count);
  }

  friend std::ostream& operator<<(std::ostream& os, const MetadataLatencyMetric &metric) {
    os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean
       << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}";
    return os;
  }
};

View File

@ -168,6 +168,42 @@ void MetricAggregator::refresh_metrics_for_rank(const entity_inst_t &client,
c->second = metrics.write_io_sizes_metric.total_size;
}
break;
case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
if (metrics.read_latency_metric.updated) {
c->first = metrics.read_latency_metric.mean.tv.tv_sec;
c->second = metrics.read_latency_metric.mean.tv.tv_nsec;
}
break;
case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
if (metrics.read_latency_metric.updated) {
c->first = metrics.read_latency_metric.sq_sum;
c->second = metrics.read_latency_metric.count;
}
break;
case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
if (metrics.write_latency_metric.updated) {
c->first = metrics.write_latency_metric.mean.tv.tv_sec;
c->second = metrics.write_latency_metric.mean.tv.tv_nsec;
}
break;
case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
if (metrics.write_latency_metric.updated) {
c->first = metrics.write_latency_metric.sq_sum;
c->second = metrics.write_latency_metric.count;
}
break;
case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
if (metrics.metadata_latency_metric.updated) {
c->first = metrics.metadata_latency_metric.mean.tv.tv_sec;
c->second = metrics.metadata_latency_metric.mean.tv.tv_nsec;
}
break;
case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
if (metrics.metadata_latency_metric.updated) {
c->first = metrics.metadata_latency_metric.sq_sum;
c->second = metrics.metadata_latency_metric.count;
}
break;
default:
ceph_abort_msg("unknown counter type");
}

View File

@ -166,7 +166,9 @@ void MetricsHandler::handle_payload(Session *session, const CapInfoPayload &payl
void MetricsHandler::handle_payload(Session *session, const ReadLatencyPayload &payload) {
dout(20) << ": type=" << payload.get_type()
<< ", session=" << session << ", latency=" << payload.lat << dendl;
<< ", session=" << session << ", latency=" << payload.lat
<< ", avg=" << payload.mean << ", sq_sum=" << payload.sq_sum
<< ", count=" << payload.count << dendl;
auto it = client_metrics_map.find(session->info.inst);
if (it == client_metrics_map.end()) {
@ -176,12 +178,17 @@ void MetricsHandler::handle_payload(Session *session, const ReadLatencyPayload &
auto &metrics = it->second.second;
metrics.update_type = UPDATE_TYPE_REFRESH;
metrics.read_latency_metric.lat = payload.lat;
metrics.read_latency_metric.mean = payload.mean;
metrics.read_latency_metric.sq_sum = payload.sq_sum;
metrics.read_latency_metric.count = payload.count;
metrics.read_latency_metric.updated = true;
}
void MetricsHandler::handle_payload(Session *session, const WriteLatencyPayload &payload) {
dout(20) << ": type=" << payload.get_type()
<< ", session=" << session << ", latency=" << payload.lat << dendl;
<< ", session=" << session << ", latency=" << payload.lat
<< ", avg=" << payload.mean << ", sq_sum=" << payload.sq_sum
<< ", count=" << payload.count << dendl;
auto it = client_metrics_map.find(session->info.inst);
if (it == client_metrics_map.end()) {
@ -191,12 +198,17 @@ void MetricsHandler::handle_payload(Session *session, const WriteLatencyPayload
auto &metrics = it->second.second;
metrics.update_type = UPDATE_TYPE_REFRESH;
metrics.write_latency_metric.lat = payload.lat;
metrics.write_latency_metric.mean = payload.mean;
metrics.write_latency_metric.sq_sum = payload.sq_sum;
metrics.write_latency_metric.count = payload.count;
metrics.write_latency_metric.updated = true;
}
void MetricsHandler::handle_payload(Session *session, const MetadataLatencyPayload &payload) {
dout(20) << ": type=" << payload.get_type()
<< ", session=" << session << ", latency=" << payload.lat << dendl;
<< ", session=" << session << ", latency=" << payload.lat
<< ", avg=" << payload.mean << ", sq_sum=" << payload.sq_sum
<< ", count=" << payload.count << dendl;
auto it = client_metrics_map.find(session->info.inst);
if (it == client_metrics_map.end()) {
@ -206,6 +218,9 @@ void MetricsHandler::handle_payload(Session *session, const MetadataLatencyPaylo
auto &metrics = it->second.second;
metrics.update_type = UPDATE_TYPE_REFRESH;
metrics.metadata_latency_metric.lat = payload.lat;
metrics.metadata_latency_metric.mean = payload.mean;
metrics.metadata_latency_metric.sq_sum = payload.sq_sum;
metrics.metadata_latency_metric.count = payload.count;
metrics.metadata_latency_metric.updated = true;
}

View File

@ -73,6 +73,12 @@ namespace ceph {
CLIENT_METRIC_TYPE_OPENED_INODES, \
CLIENT_METRIC_TYPE_READ_IO_SIZES, \
CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
}
#define CEPHFS_FEATURES_MDS_SUPPORTED CEPHFS_FEATURES_ALL

View File

@ -1104,6 +1104,12 @@ ceph_add_mds_perf_query(BaseMgrModule *self, PyObject *args)
{"opened_inodes", MDSPerformanceCounterType::OPENED_INODES_METRIC},
{"read_io_sizes", MDSPerformanceCounterType::READ_IO_SIZES_METRIC},
{"write_io_sizes", MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC},
{"avg_read_latency", MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC},
{"stdev_read_latency", MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC},
{"avg_write_latency", MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC},
{"stdev_write_latency", MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC},
{"avg_metadata_latency", MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC},
{"stdev_metadata_latency", MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC},
};
PyObject *py_query = nullptr;

View File

@ -35,6 +35,12 @@ void MDSPerformanceCounterDescriptor::pack_counter(
case MDSPerformanceCounterType::OPENED_INODES_METRIC:
case MDSPerformanceCounterType::READ_IO_SIZES_METRIC:
case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
break;
default:
ceph_abort_msg("unknown counter type");
@ -57,6 +63,12 @@ void MDSPerformanceCounterDescriptor::unpack_counter(
case MDSPerformanceCounterType::OPENED_INODES_METRIC:
case MDSPerformanceCounterType::READ_IO_SIZES_METRIC:
case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
break;
default:
ceph_abort_msg("unknown counter type");
@ -95,6 +107,24 @@ std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor
case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
os << "write_io_sizes_metric";
break;
case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
os << "avg_read_latency";
break;
case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
os << "stdev_read_latency";
break;
case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
os << "avg_write_latency";
break;
case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
os << "stdev_write_latency";
break;
case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
os << "avg_metadata_latency";
break;
case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
os << "stdev_metadata_latency";
break;
}
return os;

View File

@ -126,6 +126,12 @@ enum class MDSPerformanceCounterType : uint8_t {
OPENED_INODES_METRIC = 7,
READ_IO_SIZES_METRIC = 8,
WRITE_IO_SIZES_METRIC = 9,
AVG_READ_LATENCY_METRIC = 10,
STDEV_READ_LATENCY_METRIC = 11,
AVG_WRITE_LATENCY_METRIC = 12,
STDEV_WRITE_LATENCY_METRIC = 13,
AVG_METADATA_LATENCY_METRIC = 14,
STDEV_METADATA_LATENCY_METRIC = 15,
};
struct MDSPerformanceCounterDescriptor {
@ -143,6 +149,12 @@ struct MDSPerformanceCounterDescriptor {
case MDSPerformanceCounterType::OPENED_INODES_METRIC:
case MDSPerformanceCounterType::READ_IO_SIZES_METRIC:
case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC:
case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC:
case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC:
case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC:
case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC:
return true;
default:
return false;

View File

@ -36,9 +36,15 @@ MDS_PERF_QUERY_COUNTERS_MAP = OrderedDict({'cap_hit': 0,
'pinned_icaps': 6,
'opened_inodes': 7,
'read_io_sizes': 8,
'write_io_sizes': 9})
'write_io_sizes': 9,
'avg_read_latency': 10,
'stdev_read_latency': 11,
'avg_write_latency': 12,
'stdev_write_latency': 13,
'avg_metadata_latency': 14,
'stdev_metadata_latency': 15})
MDS_PERF_QUERY_COUNTERS = [] # type: List[str]
MDS_GLOBAL_PERF_QUERY_COUNTERS = ['cap_hit', 'read_latency', 'write_latency', 'metadata_latency', 'dentry_lease', 'opened_files', 'pinned_icaps', 'opened_inodes', 'read_io_sizes', 'write_io_sizes'] # type: List[str]
MDS_GLOBAL_PERF_QUERY_COUNTERS = list(MDS_PERF_QUERY_COUNTERS_MAP.keys())
QUERY_EXPIRE_INTERVAL = timedelta(minutes=1)

View File

@ -7,6 +7,7 @@ import errno
import json
import signal
import time
import math
from collections import OrderedDict
from datetime import datetime
@ -30,6 +31,7 @@ class MetricType(Enum):
METRIC_TYPE_PERCENTAGE = 1
METRIC_TYPE_LATENCY = 2
METRIC_TYPE_SIZE = 3
METRIC_TYPE_STDEV = 4
FS_TOP_PROG_STR = 'cephfs-top'
@ -53,6 +55,11 @@ MAIN_WINDOW_TOP_LINE_ITEMS_START = [ITEMS_PAD,
FS_TOP_MAIN_WINDOW_COL_MNT_ROOT]
MAIN_WINDOW_TOP_LINE_ITEMS_END = [FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR]
MAIN_WINDOW_TOP_LINE_METRICS_LEGACY = ["READ_LATENCY",
"WRITE_LATENCY",
"METADATA_LATENCY"
]
# adjust this map according to stats version and maintain order
# as emitted by mgr/stats
MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([
@ -66,6 +73,12 @@ MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([
("OPENED_INODES", MetricType.METRIC_TYPE_NONE),
("READ_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
("WRITE_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
("AVG_READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
("STDEV_READ_LATENCY", MetricType.METRIC_TYPE_STDEV),
("AVG_WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
("STDEV_WRITE_LATENCY", MetricType.METRIC_TYPE_STDEV),
("AVG_METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
("STDEV_METADATA_LATENCY", MetricType.METRIC_TYPE_STDEV),
])
MGR_STATS_COUNTERS = list(MAIN_WINDOW_TOP_LINE_METRICS.keys())
@ -95,7 +108,14 @@ def calc_perc(c):
def calc_lat(c):
    # c is a (seconds, nanoseconds) pair as emitted by the stats module.
    # The pasted diff left both the old (seconds) and new (milliseconds)
    # return statements; the old one executed first, so latencies were
    # reported in the wrong unit. Keep only the milliseconds form, to
    # match the "(ms)" column heading.
    return round(c[0] * 1000 + c[1] / 1000000, 2)
def calc_stdev(c):
    # c = (sum of squared deviations in nsec^2, sample count).
    # Sample standard deviation (divide by count - 1), scaled to ms;
    # fewer than two samples have no spread.
    sq_sum, count = c[0], c[1]
    if count <= 1:
        return round(0.0, 2)
    return round(math.sqrt(sq_sum / (count - 1)) / 1000000, 2)
# in MB
@ -228,6 +248,18 @@ class FSTop(object):
return "rtio"
if item == "WRITE_IO_SIZES":
return "wtio"
if item == 'AVG_READ_LATENCY':
return 'rlatavg'
if item == 'STDEV_READ_LATENCY':
return 'rlatsd'
if item == 'AVG_WRITE_LATENCY':
return 'wlatavg'
if item == 'STDEV_WRITE_LATENCY':
return 'wlatsd'
if item == 'AVG_METADATA_LATENCY':
return 'mlatavg'
if item == 'STDEV_METADATA_LATENCY':
return 'mlatsd'
else:
# return empty string for none type
return ''
@ -236,9 +268,11 @@ class FSTop(object):
if typ == MetricType.METRIC_TYPE_PERCENTAGE:
return "(%)"
elif typ == MetricType.METRIC_TYPE_LATENCY:
return "(s)"
return "(ms)"
elif typ == MetricType.METRIC_TYPE_SIZE:
return "(MB)"
elif typ == MetricType.METRIC_TYPE_STDEV:
return "(ms)"
else:
# return empty string for none type
return ''
@ -283,6 +317,8 @@ class FSTop(object):
xp += nlen
for item, typ in MAIN_WINDOW_TOP_LINE_METRICS.items():
if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
continue
it = f'{self.items(item)}{self.mtype(typ)}'
heading.append(it)
nlen = len(it) + len(ITEMS_PAD)
@ -334,6 +370,7 @@ class FSTop(object):
def refresh_client(self, client_id, metrics, counters, client_meta, x_coord_map, y_coord):
global last_time
size = 0
cur_time = time.time()
duration = cur_time - last_time
last_time = cur_time
@ -364,6 +401,9 @@ class FSTop(object):
cidx = 0
client_id = x_coord_map[FS_TOP_MAIN_WINDOW_COL_CLIENT_ID]
for item in counters:
if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
cidx += 1
continue
coord = x_coord_map[item]
hlen = coord[1] - len(ITEMS_PAD)
hlen = min(hlen, remaining_hlen)
@ -379,6 +419,8 @@ class FSTop(object):
self.mainw.addnstr(y_coord, coord[0], f'{calc_perc(m)}', hlen)
elif typ == MetricType.METRIC_TYPE_LATENCY:
self.mainw.addnstr(y_coord, coord[0], f'{calc_lat(m)}', hlen)
elif typ == MetricType.METRIC_TYPE_STDEV:
self.mainw.addnstr(y_coord, coord[0], f'{calc_stdev(m)}', hlen)
elif typ == MetricType.METRIC_TYPE_SIZE:
self.mainw.addnstr(y_coord, coord[0], f'{calc_size(m)}', hlen)