From 95d6a447cbd8a9df98ecc6851026f1ec5b9ff857 Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Fri, 15 Jun 2018 17:10:24 +0800
Subject: [PATCH 1/7] New command to start ceph-mgr. Fix a bug when calling
 the translate_to_id() function.

Signed-off-by: Li Ning
---
 deploy/mod/deploy.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/deploy/mod/deploy.py b/deploy/mod/deploy.py
index 7f877da..c280d39 100644
--- a/deploy/mod/deploy.py
+++ b/deploy/mod/deploy.py
@@ -95,7 +95,7 @@ def __init__(self, tunings=""):
             self.cluster["mdss"][mds] = ip_handler.getIpByHostInSubnet(mds)
 
         for osd in self.cluster["osds"]:
-            devices_id = self.translate_to_id(self.all_conf_data.get_list(osd))
+            devices_id = self.translate_to_id(osd, self.all_conf_data.get_list(osd))
             self.cluster[osd] = devices_id
         self.cluster["fs"] = "xfs"
 
@@ -913,12 +913,13 @@ def start_osd(self):
     def start_mgr(self, force=False):
         user = self.cluster["user"]
         head = self.cluster["head"]
-        outStr, stderr = common.pdsh(user, [head], "ceph status --format json", "check_return")
+        outStr, stderr = common.pdsh(user, [head], "ceph status --format json-pretty", "check_return")
         formatted_outStr = common.format_pdsh_return(outStr)
         ceph_status = formatted_outStr[head]
-        #outList = [x.strip() for x in outStr.split('\n')]
+        common.pdsh(user, [head], "mkdir -p /var/lib/ceph/mgr/", option="console")
+
         if "no active mgr" in outStr:
-            common.pdsh(user, [head], "ceph auth get-or-create mgr.admin mon 'allow *' && ceph-mgr -i %s" % ceph_status["fsid"], option="console")
+            common.pdsh(user, [head], "ceph auth get-or-create mgr.admin mon 'allow profile mgr' osd 'allow *' mds 'allow *' 2>/dev/null 1>/var/lib/ceph/mgr/ceph-admin && ceph-mgr -i admin", option="console")
             common.printout("LOG", "create mgr success: admin")
         else:
             common.printout("LOG", "not need create mgr")

From b7d7ec455af492fb2b0f9d2bed51860d809e98cf Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Wed, 8 May 2019 19:21:18 -0700
Subject: [PATCH 2/7] Setup ceph-mgr daemon

Signed-off-by: Li Ning
---
 deploy/mod/deploy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deploy/mod/deploy.py b/deploy/mod/deploy.py
index c280d39..2b962ff 100644
--- a/deploy/mod/deploy.py
+++ b/deploy/mod/deploy.py
@@ -917,6 +917,7 @@ def start_mgr(self, force=False):
         formatted_outStr = common.format_pdsh_return(outStr)
         ceph_status = formatted_outStr[head]
         common.pdsh(user, [head], "mkdir -p /var/lib/ceph/mgr/", option="console")
+        common.pdsh(user, [head], "ceph auth get-or-create mgr.admin mon 'allow profile mgr' osd 'allow *' mds 'allow *' 2>/dev/null 1>/var/lib/ceph/mgr/ceph-admin && ceph-mgr -i admin", option="console")
 
         if "no active mgr" in outStr:
             common.pdsh(user, [head], "ceph auth get-or-create mgr.admin mon 'allow profile mgr' osd 'allow *' mds 'allow *' 2>/dev/null 1>/var/lib/ceph/mgr/ceph-admin && ceph-mgr -i admin", option="console")

From fa55766b365afa87338ad3eda0684f15d30e6c12 Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Wed, 8 May 2019 20:17:31 -0700
Subject: [PATCH 3/7] Fix "application not enabled" error

Signed-off-by: Li Ning
---
 benchmarking/mod/benchmark.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarking/mod/benchmark.py b/benchmarking/mod/benchmark.py
index 08d3b70..6473c02 100644
--- a/benchmarking/mod/benchmark.py
+++ b/benchmarking/mod/benchmark.py
@@ -178,6 +178,7 @@ def run(self):
         common.pdsh(user, nodes, "sync && echo '%s' > /proc/sys/vm/drop_caches" % self.cluster["cache_drop_level"])
 
         #send command to ceph cluster
+        common.pdsh(user, [head], "ceph osd pool application enable rbd rbd", option="console")
         common.pdsh(user, nodes, "for i in `seq 1 %d`;do echo `date \"+%s\"` `ceph health` >> %s/`hostname`_ceph_health.txt; sleep %s;done" % (time_tmp/int(monitor_interval)+1, "%Y_%m_%d %H:%M:%S", dest_dir, monitor_interval), option="force")
         common.pdsh(user, nodes, "ps aux | grep ceph-osd | grep -v 'grep' > %s/`hostname`_ps.txt" % (dest_dir))
         common.pdsh(user, nodes, "date > %s/`hostname`_process_log.txt" % (dest_dir))

From 3375c6e9f67b01c5e0b3903d8494206a36d8f9b5 Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Wed, 8 May 2019 20:25:48 -0700
Subject: [PATCH 4/7] Fix the ceph status output error

Signed-off-by: Li Ning
---
 conf/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/common.py b/conf/common.py
index 1539643..c4e60e2 100644
--- a/conf/common.py
+++ b/conf/common.py
@@ -642,7 +642,7 @@ def get_ceph_health(user, node):
     res = format_pdsh_return(stdout)
     if len(res):
         stdout = res[node]
-        output["ceph_status"] = stdout['health']['overall_status']
+        output["ceph_status"] = stdout['health']['status']
         output["detail"] = stdout['health']['checks']
         if "write_bytes_sec" in stdout['pgmap']:
             str_wb = str(stdout['pgmap']['write_bytes_sec'] / 1024 / 1024) + ' MB/s wr, '

From 962ee1bb17000d4b4d784b6e0a02931d7c6dc886 Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Wed, 8 May 2019 22:18:38 -0700
Subject: [PATCH 5/7] Remove mon_pg_warn_max_per_osd from default conf

Signed-off-by: Li Ning
---
 conf/handler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conf/handler.py b/conf/handler.py
index facb3db..4c66c3c 100644
--- a/conf/handler.py
+++ b/conf/handler.py
@@ -222,7 +222,6 @@ def list_required_config(self):
         required_list["system"]["disk|read_ahead_kb"] = 2048
         required_list["ceph_tuning"] = OrderedDict()
         required_list["ceph_tuning"]["pool|rbd|size"] = 2
-        required_list["ceph_tuning"]["global|mon_pg_warn_max_per_osd"] = 1000
         required_list["analyzer"] = OrderedDict()
         required_list["analyzer"]["analyzer"] = "all"

From 98289035bd14b666b4b4beb10cd5723f091d3cf5 Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Wed, 8 May 2019 22:26:33 -0700
Subject: [PATCH 6/7] Fix the device name of read_ahead_kb

Signed-off-by: Li Ning
---
 tuner/tuner.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tuner/tuner.py b/tuner/tuner.py
index 76342c0..217a7ec 100644
--- a/tuner/tuner.py
+++ b/tuner/tuner.py
@@ -52,9 +52,10 @@ def handle_disk(self, option="get", param={'read_ahead_kb':2048, 'max_sectors_kb
         for osd in osds:
             for device in self.cluster[osd]:
                 parsed_device_name = common.parse_device_name(device)
+                parsed_device_name = 'loop' + re.findall(r'\d', parsed_device_name)[0]
                 tmp = {}
                 for key, value in param.items():
-                    stdout, stderr = common.pdsh(user, [osd], 'sh -c "cat /sys/block/%s/queue/%s"' % (parsed_device_name, key), option="check_return")
+                    stdout, stderr = common.pdsh(user, [osd], 'sh -c "cat /sys/devices/virtual/block/%s/queue/%s"' % (parsed_device_name, key), option="check_return")
                     res = common.format_pdsh_return(stdout)
                     tmp[key] = res[osd]
                 stdout, stderr = common.pdsh(user, [osd], 'xfs_info %s' % (device), option="check_return")
@@ -69,8 +70,10 @@ def handle_disk(self, option="get", param={'read_ahead_kb':2048, 'max_sectors_kb
         for osd in osds:
            for device in self.cluster[osd]:
                 parsed_device_name = common.parse_device_name(device)
+                parsed_device_name = 'loop' + re.findall(r'\d', parsed_device_name)[0]
                 for key, value in param.items():
-                    stdout, stderr = common.pdsh(user, [osd], 'sh -c "echo %s > /sys/block/%s/queue/%s"' % (str(value), parsed_device_name, key), option="check_return")
+                    stdout, stderr = common.pdsh(user, [osd], 'sh -c "echo %s > /sys/devices/virtual/block/%s/queue/%s"' % (str(value), parsed_device_name, key), option="check_return")
+                    common.printout("LOG", "change the read_ahead_kb to %s, device name = %s, key = %s" % (value, parsed_device_name, key))
 
     def get_version(self):
         common.printout("LOG"," Test start running function : %s"%(self.__class__.__name__,sys._getframe().f_code.co_name),screen=False,log_level="LVL4")
@@ -270,6 +273,7 @@ def apply_tuning(self, jobname, no_check = False):
                 else:
                     tmp_tuning_diff = ['global']
 
+                common.printout("LOG","tmp_tuning_diff = %s" % tmp_tuning_diff)
                 if 'disk' in tmp_tuning_diff:
                     param = {}
                     for param_name, param_data in self.worksheet[jobname]['disk'].items():

From 6f6b0ffdbabf3cf2ac746dbb52252360feff170d Mon Sep 17 00:00:00 2001
From: Li Ning
Date: Wed, 8 May 2019 22:59:48 -0700
Subject: [PATCH 7/7] Support read/write latency display separately

Signed-off-by: Li Ning
---
 analyzer/analyzer.py | 78 ++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/analyzer/analyzer.py b/analyzer/analyzer.py
index 3b52dab..b08d7bd 100644
--- a/analyzer/analyzer.py
+++ b/analyzer/analyzer.py
@@ -430,6 +430,12 @@ def summary_result(self, data):
             max_lat = 0
             max_lat_95 = 0
             max_lat_99 = 0
+            max_write_lat = 0
+            max_read_lat = 0
+            max_write_lat_95 = 0
+            max_read_lat_95 = 0
+            max_write_lat_99 = 0
+            max_read_lat_99 = 0
             for engine_candidate in data["workload"].keys():
                 if engine_candidate in benchmark_tool:
                     engine = engine_candidate
@@ -441,28 +447,43 @@ def summary_result(self, data):
                         write_IOPS += float(node_data["write_iops"])
                         write_BW += float(node_data["write_bw"])
                         write_Latency += float(node_data["write_lat"])
-                        max_lat_95 += float(node_data["95.00th%_lat"])
-                        max_lat_99 += float(node_data["99.00th%_lat"])
-                        max_lat += float(node_data["99.99th%_lat"])
+                        #max_lat_95 += float(node_data["95.00th%_lat"])
+                        #max_lat_99 += float(node_data["99.00th%_lat"])
+                        #max_lat += float(node_data["99.99th%_lat"])
+                        max_write_lat += float(node_data["99.99th%_write_lat"])
+                        max_read_lat += float(node_data["99.99th%_read_lat"])
+                        max_write_lat_95 += float(node_data["95.00th%_write_lat"])
+                        max_read_lat_95 += float(node_data["95.00th%_read_lat"])
+                        max_write_lat_99 += float(node_data["99.00th%_write_lat"])
+                        max_read_lat_99 += float(node_data["99.00th%_read_lat"])
                 if tmp_data["Op_Type"] in ["randread", "seqread", "read"]:
                     tmp_data["IOPS"] = "%.3f" % read_IOPS
                     tmp_data["BW(MB/s)"] = "%.3f" % read_BW
                     if rbd_count > 0:
                         tmp_data["Latency(ms)"] = "%.3f" % (read_Latency/rbd_count)
+                        tmp_data["95.00th%_lat(ms)"] = "%.3f" % (max_read_lat_95/rbd_count)
+                        tmp_data["99.00th%_lat(ms)"] = "%.3f" % (max_read_lat_99/rbd_count)
+                        tmp_data["99.99th%_lat(ms)"] = "%.3f" % (max_read_lat/rbd_count)
                 elif tmp_data["Op_Type"] in ["randwrite", "seqwrite", "write"]:
                     tmp_data["IOPS"] = "%.3f" % write_IOPS
                     tmp_data["BW(MB/s)"] = "%.3f" % write_BW
                     if rbd_count > 0:
                         tmp_data["Latency(ms)"] = "%.3f" % (write_Latency/rbd_count)
+                        tmp_data["95.00th%_lat(ms)"] = "%.3f" % (max_write_lat_95/rbd_count)
+                        tmp_data["99.00th%_lat(ms)"] = "%.3f" % (max_write_lat_99/rbd_count)
+                        tmp_data["99.99th%_lat(ms)"] = "%.3f" % (max_write_lat/rbd_count)
                 elif tmp_data["Op_Type"] in ["randrw", "rw", "readwrite"]:
                     tmp_data["IOPS"] = "%.3f, %.3f" % (read_IOPS, write_IOPS)
                     tmp_data["BW(MB/s)"] = "%.3f, %.3f" % (read_BW, write_BW)
                     if rbd_count > 0:
                         tmp_data["Latency(ms)"] = "%.3f, %.3f" % ((read_Latency/rbd_count), (write_Latency/rbd_count))
+                        tmp_data["95.00th%_lat(ms)"] = "%.3f, %.3f" % ((max_read_lat_95/rbd_count), (max_write_lat_95/rbd_count))
+                        tmp_data["99.00th%_lat(ms)"] = "%.3f, %.3f" % ((max_read_lat_99/rbd_count), (max_write_lat_99/rbd_count))
+                        tmp_data["99.99th%_lat(ms)"] = "%.3f, %.3f" % ((max_read_lat/rbd_count), (max_write_lat/rbd_count))
-            if rbd_count > 0:
-                tmp_data["95.00th%_lat(ms)"] = "%.3f" % (max_lat_95/rbd_count)
-                tmp_data["99.00th%_lat(ms)"] = "%.3f" % (max_lat_99/rbd_count)
-                tmp_data["99.99th%_lat(ms)"] = "%.3f" % (max_lat/rbd_count)
+#            if rbd_count > 0:
+#                tmp_data["95.00th%_lat(ms)"] = "%.3f" % (max_lat_95/rbd_count)
+#                tmp_data["99.00th%_lat(ms)"] = "%.3f" % (max_lat_99/rbd_count)
+#                tmp_data["99.99th%_lat(ms)"] = "%.3f" % (max_lat/rbd_count)
             except:
                 err_log = traceback.format_exc()
                 common.printout("ERROR","%s" % err_log)
@@ -474,7 +495,7 @@ def summary_result(self, data):
         write_SN_Latency = 0
         diskformat = common.parse_disk_format( self.cluster['diskformat'] )
         if len(diskformat):
-            typename = diskformat[0]
+            typename = diskformat[1]
         else:
             typename = "osd"
         for node, node_data in data["ceph"][typename]['summary'].items():
@@ -882,8 +903,8 @@ def process_iostat_data(self, node, path):
                     disk_list=[]
                     for osd_journal in common.get_list(self.all_conf_data.get_list(node)):
                         tmp_dev_name = osd_journal[i].split('/')[2]
-                        if 'nvme' in tmp_dev_name:
-                            tmp_dev_name = common.parse_nvme( tmp_dev_name )
+                        #if 'nvme' in tmp_dev_name:
+                        #    tmp_dev_name = common.parse_nvme( tmp_dev_name )
                         if tmp_dev_name not in disk_list:
                             disk_list.append( tmp_dev_name )
                     dict_diskformat[output_list[i]]=disk_list
@@ -963,11 +984,15 @@ def process_fio_data(self, path, dirname):
         result = {}
         try:
             stdout, stderr = common.bash("grep \" IOPS=.*BW=.*\| *io=.*bw=.*iops=.*runt=.*\|^ *lat.*min=.*max=.*avg=.*stdev=.*\" "+path, True)
-            stdout1, stderr1 = common.bash("grep \" *1.00th.*],\| *30.00th.*],\| *70.00th.*],\| *99.00th.*],\| *99.99th.*]\" "+path, True)
+            stdout1_read, stderr1_read = common.bash("grep \" *1.00th.*],\| *30.00th.*],\| *70.00th.*],\| *99.00th.*],\| *99.99th.*]\" "+path+" | head -5", True)
+            stdout1_write, stderr1_write = common.bash("grep \" *1.00th.*],\| *30.00th.*],\| *70.00th.*],\| *99.00th.*],\| *99.99th.*]\" "+path+" | tail -5", True)
             stdout2, stderr2 = common.bash("grep \" *clat percentiles\" "+path, True)
-            lat_per_dict = {}
-            if stdout1 != '':
-                lat_per_dict = self.get_lat_persent_dict(stdout1)
+            lat_per_dict_read = {}
+            lat_per_dict_write = {}
+            if stdout1_read != '':
+                lat_per_dict_read = self.get_lat_persent_dict(stdout1_read)
+            if stdout1_write != '':
+                lat_per_dict_write = self.get_lat_persent_dict(stdout1_write)
 
             fio_data_rw = {}
             fio_data_rw["read"] = {}
@@ -999,17 +1024,30 @@ def process_fio_data(self, path, dirname):
                 output_fio_data['write_iops'] = 0
                 output_fio_data['write_bw'] = 0
                 output_fio_data['write_runtime'] = 0
-
-            if len(lat_per_dict) != 0:
+
+            if len(lat_per_dict_read) != 0:
                 for tmp_key in ["95.00th", "99.00th", "99.99th"]:
-                    if tmp_key in lat_per_dict.keys():
+                    if tmp_key in lat_per_dict_read.keys():
                         lat_persent_unit = re.findall(r"(?<=[\(])[^\)]+(?=[\)])", stdout2.strip('\n').strip(' ').replace(' ',''))
                         if len(lat_persent_unit) != 0:
-                            output_fio_data[tmp_key+"%_lat"] = float(common.time_to_sec("%s%s" % (lat_per_dict[tmp_key], lat_persent_unit[0]),'msec'))
+                            output_fio_data[tmp_key+"%_read_lat"] = float(common.time_to_sec("%s%s" % (lat_per_dict_read[tmp_key], lat_persent_unit[0]), 'msec'))
+                        else:
+                            output_fio_data[tmp_key+"%_read_lat"] = 'null'
+                    else:
+                        output_fio_data[tmp_key+"%_read_lat"] = 'null'
+
+            if len(lat_per_dict_write) != 0:
+                for tmp_key in ["95.00th", "99.00th", "99.99th"]:
+                    if tmp_key in lat_per_dict_write.keys():
+                        lat_persent_unit = re.findall(r"(?<=[\(])[^\)]+(?=[\)])", stdout2.strip('\n').strip(' ').replace(' ',''))
+                        if len(lat_persent_unit) == 1:
+                            output_fio_data[tmp_key+"%_write_lat"] = float(common.time_to_sec("%s%s" % (lat_per_dict_write[tmp_key], lat_persent_unit[0]), 'msec'))
+                        elif len(lat_persent_unit) == 2:
+                            output_fio_data[tmp_key+"%_write_lat"] = float(common.time_to_sec("%s%s" % (lat_per_dict_write[tmp_key], lat_persent_unit[1]), 'msec'))
                         else:
-                            output_fio_data[tmp_key+"%_lat"] = 'null'
+                            output_fio_data[tmp_key+"%_write_lat"] = 'null'
                     else:
-                        output_fio_data[tmp_key+"%_lat"] = 'null'
+                        output_fio_data[tmp_key+"%_write_lat"] = 'null'
             output_fio_data['lat_unit'] = 'msec'
             output_fio_data['runtime_unit'] = 'sec'
             output_fio_data['bw_unit'] = 'MB/s'
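
Note on PATCH 7/7: the "head -5" / "tail -5" split relies on fio's plain-text
output printing the read section's percentile lines before the write section's,
so the first half of the grepped lines belongs to reads and the second half to
writes. Below is a minimal standalone sketch of that idea, assuming that
ordering holds; SAMPLE and parse_percentiles() are hypothetical stand-ins for a
real fio log and for Analyzer.get_lat_persent_dict(), not CeTune code.

import re

# Hypothetical excerpt of percentile lines grepped from a fio log; a real log
# carries five such lines per section (1.00th/30.00th/70.00th/99.00th/99.99th).
SAMPLE = """\
 | 95.00th=[ 1020], 99.00th=[ 1400], 99.99th=[ 2040]
 | 95.00th=[ 2220], 99.00th=[ 3130], 99.99th=[ 4080]
"""

def parse_percentiles(line):
    # Turn "95.00th=[ 1020]" pairs into {"95.00th": 1020.0}.
    return {key: float(val)
            for key, val in re.findall(r"(\d+\.\d+th)=\[\s*(\d+)\]", line)}

matched = [l for l in SAMPLE.splitlines() if "th=[" in l]
half = len(matched) // 2          # first half = read, second half = write
read_lat, write_lat = {}, {}
for line in matched[:half]:
    read_lat.update(parse_percentiles(line))
for line in matched[half:]:
    write_lat.update(parse_percentiles(line))

print("read:", read_lat)   # {'95.00th': 1020.0, '99.00th': 1400.0, '99.99th': 2040.0}
print("write:", write_lat) # {'95.00th': 2220.0, '99.00th': 3130.0, '99.99th': 4080.0}

This layout also appears to explain the len(lat_persent_unit) == 2 branch in
the patch's write path: with separate read and write "clat percentiles" header
lines, the second captured unit belongs to the write section.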