From cab7a43d5bb7841be03c7cf91f234ac64fb562a5 Mon Sep 17 00:00:00 2001 From: Renato Covarrubias Date: Thu, 4 Jan 2018 16:30:04 -0300 Subject: [PATCH] Speed up docker_check with parallel check --- docker_check.py | 211 +++++++++++++++++++++++++++++++----------------- 1 file changed, 139 insertions(+), 72 deletions(-) diff --git a/docker_check.py b/docker_check.py index 63bf286..646eb21 100755 --- a/docker_check.py +++ b/docker_check.py @@ -7,6 +7,16 @@ import sys import argparse +import logging +import queue +import threading + +logging.basicConfig( + format='%(asctime)s level=%(levelname)-7s ' + 'threadName=%(threadName)s name=%(name)s %(message)s', + level=logging.INFO +) + try: import docker except ImportError as error: @@ -19,34 +29,40 @@ __version__ = '0.1' -def get_mem_pct(container, stats): +def get_ct_stats(container): + '''Get container status''' + return container.stats(stream=False) + + +def get_mem_pct(stats): '''Get a container memory usage in %''' - mem = stats[container]['memory_stats'] - usage = mem['usage'] - limit = mem['limit'] + usage = stats['memory_stats']['usage'] + limit = stats['memory_stats']['limit'] return round(usage * 100 / limit, 2) -def get_cpu_pct(container): - '''Get a container cpu usage in % via docker stats cmd''' - usage = str( - os.popen("docker stats --no-stream=true " + container).read() - ).split() - usage_pct = usage[usage.index(container) + 1] - return float(usage_pct[:-1]) +def get_cpu_pct(stats): + '''Get a container cpu usage in %''' + cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - \ + stats['precpu_stats']['cpu_usage']['total_usage'] + system_delta = stats['cpu_stats']['system_cpu_usage'] - \ + stats['precpu_stats']['system_cpu_usage'] + online_cpus = stats['cpu_stats']['online_cpus'] + if cpu_delta > 0 and system_delta > 0: + return (cpu_delta / system_delta) * online_cpus * 100 + return 0.0 -def get_net_io(container, stats): +def get_net_io(stats): '''Get a container Net In / Out usage since it's launche''' - net = stats[container]['networks'] - net_in = net['eth0']['rx_bytes'] - net_out = net['eth0']['tx_bytes'] - return [net_in, net_out] + net_in = stats['networks']['eth0']['rx_bytes'] + net_out = stats['networks']['eth0']['tx_bytes'] + return net_in, net_out -def get_disk_io(container, stats): +def get_disk_io(stats): '''Get a container Disk In / Out usage since it's launche''' - disk = stats[container]['blkio_stats']['io_service_bytes_recursive'] + disk = stats['blkio_stats']['io_service_bytes_recursive'] try: disk_in = disk[0]['value'] except IndexError: @@ -58,9 +74,54 @@ def get_disk_io(container, stats): return disk_in, disk_out -def get_ct_stats(container, client): - '''Get container status''' - return client.containers.get(container).stats(stream=False) +def get_ct_metrics(container_queue, containers_stats): + '''Get container metrics from docker API''' + logging.debug("Running get_ct_metrics()") + while not container_queue.empty(): + container = container_queue.get() + logging.debug("Get container %s stats", container.id) + stats = get_ct_stats(container) + + mem_pct = get_mem_pct(stats) + cpu_pct = get_cpu_pct(stats) + net_in, net_out = get_net_io(stats) + disk_in, disk_out = get_disk_io(stats) + + containers_stats['%s_mem_pct' % container.id] = mem_pct + containers_stats['%s_cpu_pct' % container.id] = cpu_pct + containers_stats['%s_net_in' % container.id] = net_in + containers_stats['%s_net_out' % container.id] = net_out + containers_stats['%s_disk_in' % container.id] = disk_in + containers_stats['%s_disk_out' % container.id] = disk_out + + container_queue.task_done() + logging.debug("Done with container %s stats", container.id) + logging.debug("End get_ct_metrics()") + + +def get_ct_stats_message(containers_stats): + '''Get check message from containers stats''' + return ', '.join( + [ + "%s have %.2f%% %s" % ( + k.split('_')[0][:12], + v, + k.split('_')[1]) + for k, v + in containers_stats.items() + ] + ) + + +def get_ct_perfdata_message(containers_stats): + '''Get perfdata message from containers stats''' + return ' '.join( + [ + "%s_%s=%s" % (k[:12], "_".join(k.split("_")[1:3]), v) + for k, v + in containers_stats.items() + ] + ) def main(): @@ -74,65 +135,71 @@ def main(): # Try to use the lastest API version otherwise use # the installed client API version + # Get list of running containers try: - docker.from_env().containers.list() + containers_list = docker.from_env().containers.list() client = docker.from_env() except docker.errors.APIError as error: version = re.sub('[^0-9.]+', '', str(error).split('server API version:')[1]) client = docker.from_env(version=version) - # Get list of running containers - containers_list = client.containers.list() - containers = [] - # If cid is True containers IDs will be used, otherwise names - cid = False - for i in containers_list: - cid = str(i).replace('<', '').replace('>', '').split()[1] - if cid: - containers.append(cid) - else: - containers.append( - os.popen("docker ps -f id=" + cid).read().split()[-1]) - # Get stats and metrics - summary = '' - stats = {} - metrics = [0, 0] - ct_stats = {} - for i in containers: - ct_stats[i] = get_ct_stats(i, client) - mem_pct = get_mem_pct(i, ct_stats) - cpu_pct = get_cpu_pct(i) - net_in = get_net_io(i, ct_stats)[0] - net_out = get_net_io(i, ct_stats)[1] - disk_in = get_disk_io(i, ct_stats)[0] - disk_out = get_disk_io(i, ct_stats)[1] - stats[i + '_mem_pct'] = mem_pct - stats[i + '_cpu_pct'] = cpu_pct - summary += '{}_mem_pct={}% {}_cpu_pct={}% {}_net_in={} {}_net_out={} '\ - '{}_disk_in={} {}_disk_out={} '.format( - i, mem_pct, i, cpu_pct, i, net_in, i, net_out, i, - disk_in, i, disk_out) - # Get the highest % use - for stat in stats: - if stats[stat] >= metrics[1]: - metrics[0] = stat - metrics[1] = stats[stat] + containers_list = client.containers.list() + + logging.debug("containers_list = %s", containers_list) + + # START + containers_queue = queue.Queue() + for container in containers_list: + containers_queue.put(container) + + containers_stats = {} + + # Set up some threads to fetch the enclosures + for th_id in range(len(containers_list)): + worker = threading.Thread( + target=get_ct_metrics, + args=(containers_queue, containers_stats,), + name='worker-{}'.format(th_id), + ) + worker.setDaemon(True) + worker.start() + + containers_queue.join() + + logging.debug("containers_stats = %s", containers_stats) + stats = { + k: v + for k, v + in containers_stats.items() + if k.endswith('_mem_pct') or k.endswith('_cpu_pct') + } + logging.debug("stats = %s", stats) + # Check stats values and output perfdata - if metrics[1] < args.warning: - print("OK | {}".format(summary)) - sys.exit(0) - elif args.warning <= metrics[1] <= args.critical: - print("WARNING: Some containers need your attention: {} have {}%'\ - ' | {}".format(metrics[0], metrics[1], summary)) - sys.exit(1) - elif metrics[1] > 80: - print("CRITICAL: Some containers need your attention: {} have {}%'\ - ' | {}".format(metrics[0], metrics[1], summary)) + critical_ct = {k: v for k, v in stats.items() if v > args.critical} + if critical_ct: + print("CRITICAL: %s | %s" % ( + get_ct_stats_message(critical_ct), + get_ct_perfdata_message(containers_stats))) sys.exit(2) - else: - print("UKNOWN | {}".format(summary)) - sys.exit(3) + + warning_ct = {k: v for k, v in stats.items() if v > args.warning} + if warning_ct: + print("WARNING: %s | %s" % ( + get_ct_stats_message(warning_ct), + get_ct_perfdata_message(containers_stats))) + sys.exit(1) + + print("OK | %s" % get_ct_perfdata_message(containers_stats)) + sys.exit(0) if __name__ == '__main__': - main() + try: + main() + except BaseException as exc: + EXC_TYPE, _, EXC_TRACEBACK = sys.exc_info() + FNAME = os.path.split(EXC_TRACEBACK.tb_frame.f_code.co_filename)[1] + print("UNKNOWN: %s Exception \"%s\" in %s line %s" % ( + EXC_TYPE.__name__, exc, FNAME, EXC_TRACEBACK.tb_lineno)) + sys.exit(3)