#!/usr/bin/env python3

'''docker_check.py is a nagios compatible plugin to check docker containers.

For every running container the plugin samples memory %, CPU %, network
bytes and block-device bytes through the Docker SDK, compares the two
percentages with the warning / critical thresholds and prints one Nagios
status line followed by perfdata.

Exit codes: 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.
'''

import os
import re
import sys
import argparse
import logging
import queue
import threading
import traceback

logging.basicConfig(
    format='%(asctime)s level=%(levelname)-7s '
    'threadName=%(threadName)s name=%(name)s %(message)s',
    level=logging.INFO
)

try:
    import docker
except ImportError as error:
    # Do not exit at import time: main() reports the missing dependency as
    # UNKNOWN (3).  Exiting here with 1 would be read by Nagios as WARNING
    # and would make the pure helper functions below unimportable.
    docker = None
    DOCKER_IMPORT_ERROR = error
else:
    DOCKER_IMPORT_ERROR = None

__author__ = 'El Acheche Anis'
__version__ = '0.1'


def get_ct_stats(container):
    '''Return one (non-streaming) stats snapshot of *container*.'''
    return container.stats(stream=False)


def get_mem_pct(stats):
    '''Get a container memory usage in %, rounded to two decimals.

    Returns 0.0 when the memory section, the usage or the limit is missing
    or when the limit is zero, instead of raising.
    '''
    memory = stats.get('memory_stats') or {}
    limit = memory.get('limit', 0)
    if not limit:
        return 0.0
    return round(memory.get('usage', 0) * 100 / limit, 2)


def get_cpu_pct(stats):
    '''Get a container cpu usage in %.

    The value is (container cpu delta / system cpu delta) * number of cpus
    * 100, computed between the ``precpu_stats`` and ``cpu_stats`` samples.
    Returns 0.0 when either delta is not positive or a counter is missing.
    '''
    cpu = stats.get('cpu_stats') or {}
    precpu = stats.get('precpu_stats') or {}
    cpu_usage = cpu.get('cpu_usage') or {}
    precpu_usage = precpu.get('cpu_usage') or {}

    cpu_delta = cpu_usage.get('total_usage', 0) - \
        precpu_usage.get('total_usage', 0)
    system_delta = cpu.get('system_cpu_usage', 0) - \
        precpu.get('system_cpu_usage', 0)
    if cpu_delta <= 0 or system_delta <= 0:
        return 0.0

    online_cpus = cpu.get('online_cpus')
    if not online_cpus:
        # Fallback when 'online_cpus' is not reported: count the cpus that
        # show some usage.  The list is looked up under 'cpu_usage' first
        # and directly under 'cpu_stats' second (the location the previous
        # implementation read).
        percpu = cpu_usage.get('percpu_usage') or \
            cpu.get('percpu_usage') or []
        online_cpus = len([item for item in percpu if item > 0])
    return (cpu_delta / system_delta) * online_cpus * 100


def get_net_io(stats):
    '''Get a container Net In / Out usage since its launch.

    Returns ``(rx_bytes, tx_bytes)`` summed over every interface listed in
    ``stats['networks']``; ``(0, 0)`` when there is no such section.
    '''
    interfaces = (stats.get('networks') or {}).values()
    net_in = sum(iface.get('rx_bytes', 0) for iface in interfaces)
    net_out = sum(iface.get('tx_bytes', 0) for iface in interfaces)
    return net_in, net_out


def get_disk_io(stats):
    '''Get a container Disk In / Out usage since its launch.

    Returns ``(read_bytes, written_bytes)`` summed over all entries of
    ``blkio_stats.io_service_bytes_recursive`` whose ``op`` is ``read`` /
    ``write`` (case-insensitive); ``(0, 0)`` when the list is missing,
    ``None`` or empty.
    '''
    blkio = stats.get('blkio_stats') or {}
    disk_in = 0
    disk_out = 0
    for entry in blkio.get('io_service_bytes_recursive') or []:
        operation = str(entry.get('op', '')).lower()
        if operation == 'read':
            disk_in += entry.get('value', 0)
        elif operation == 'write':
            disk_out += entry.get('value', 0)
    return disk_in, disk_out


def get_ct_metrics(container_queue, containers_stats, errors=None):
    '''Drain *container_queue* and store each container's metrics.

    For every container taken from the queue, six entries are written to
    *containers_stats*: ``<name>_mem_pct``, ``<name>_cpu_pct``,
    ``<name>_net_in``, ``<name>_net_out``, ``<name>_disk_in`` and
    ``<name>_disk_out``.

    A failure while sampling one container is appended to *errors* as
    ``"<name>: <ExceptionType>: <message>"`` (or logged when *errors* is
    None) and the loop continues.  ``task_done()`` is always called, so a
    ``container_queue.join()`` in the caller cannot block forever.
    '''
    logging.debug("Running get_ct_metrics()")
    while True:
        # get_nowait() avoids the empty()/get() race in which another
        # worker takes the last item and this one blocks in get() forever.
        try:
            container = container_queue.get_nowait()
        except queue.Empty:
            break

        name = repr(container)
        try:
            name = container.name
            logging.debug("Get container %s stats", name)
            stats = get_ct_stats(container)

            mem_pct = get_mem_pct(stats)
            cpu_pct = get_cpu_pct(stats)
            net_in, net_out = get_net_io(stats)
            disk_in, disk_out = get_disk_io(stats)

            # Written in one step, only after every metric was computed,
            # so a failing container never leaves partial entries behind.
            containers_stats.update({
                '%s_mem_pct' % name: mem_pct,
                '%s_cpu_pct' % name: cpu_pct,
                '%s_net_in' % name: net_in,
                '%s_net_out' % name: net_out,
                '%s_disk_in' % name: disk_in,
                '%s_disk_out' % name: disk_out,
            })
            logging.debug("Done with container %s stats", name)
        except Exception as exc:  # pylint: disable=broad-except
            message = "%s: %s: %s" % (name, type(exc).__name__, exc)
            if errors is None:
                logging.error("Could not get container stats: %s", message)
            else:
                errors.append(message)
        finally:
            container_queue.task_done()
    logging.debug("End get_ct_metrics()")


def get_ct_stats_message(containers_stats):
    '''Get check message from containers stats.

    Entries are sorted by key so the text does not depend on the order in
    which worker threads finished.
    '''
    return ', '.join(
        "%s have %.2f%%" % (key, value)
        for key, value in sorted(containers_stats.items())
    )


def get_ct_perfdata_message(containers_stats):
    '''Get perfdata message (``key=value`` pairs, sorted by key).'''
    return ' '.join(
        "%s=%s" % (key, value)
        for key, value in sorted(containers_stats.items())
    )


def get_check_result(containers_stats, warning, critical, errors=()):
    '''Turn collected metrics into ``(exit_code, output_line)``.

    Only the ``*_mem_pct`` and ``*_cpu_pct`` entries are compared with the
    thresholds; every entry is reported as perfdata.  Precedence is
    CRITICAL (2, any value > *critical*), then UNKNOWN (3, any collection
    error), then WARNING (1, any value > *warning*), then OK (0).
    '''
    perfdata = get_ct_perfdata_message(containers_stats)
    stats = {
        key: value
        for key, value in containers_stats.items()
        if key.endswith(('_mem_pct', '_cpu_pct'))
    }

    critical_ct = {k: v for k, v in stats.items() if v > critical}
    if critical_ct:
        return 2, "CRITICAL: %s | %s" % (
            get_ct_stats_message(critical_ct), perfdata)

    if errors:
        # '|' separates status text from perfdata, keep it out of the text.
        details = '; '.join(errors).replace('|', '/')
        return 3, "UNKNOWN: %s | %s" % (details, perfdata)

    warning_ct = {k: v for k, v in stats.items() if v > warning}
    if warning_ct:
        return 1, "WARNING: %s | %s" % (
            get_ct_stats_message(warning_ct), perfdata)

    return 0, "OK | %s" % perfdata


def main():
    '''Script main function: collect the stats, print the result, exit.'''
    parser = argparse.ArgumentParser(description='Check docker processes.')
    parser.add_argument('-w', '--warning', type=int,
                        help='warning percentage (default 50)', default=50)
    parser.add_argument('-c', '--critical', type=int,
                        help='critical percentage (default 80)', default=80)
    args = parser.parse_args()

    if docker is None:
        print("UNKNOWN: {}: Please install the docker module, you can use "
              "'pip install docker' to do that".format(DOCKER_IMPORT_ERROR))
        sys.exit(3)

    # Try to use the latest API version, otherwise use the version the
    # server reports in its error message.
    # Get list of running containers
    try:
        client = docker.from_env()
        containers_list = client.containers.list()
    except docker.errors.APIError as error:
        match = re.search(
            r'server API version:\s*([0-9]+(?:\.[0-9]+)*)', str(error))
        if match is None:
            # Not a version mismatch: let the caller report it as UNKNOWN.
            raise
        client = docker.from_env(version=match.group(1))
        containers_list = client.containers.list()

    logging.debug("containers_list = %s", containers_list)

    containers_queue = queue.Queue()
    for container in containers_list:
        containers_queue.put(container)

    containers_stats = {}
    errors = []

    # One worker thread per container so the stats calls run concurrently.
    for th_id in range(len(containers_list)):
        worker = threading.Thread(
            target=get_ct_metrics,
            args=(containers_queue, containers_stats, errors),
            name='worker-{}'.format(th_id),
            daemon=True,
        )
        worker.start()

    containers_queue.join()

    logging.debug("containers_stats = %s", containers_stats)

    # Check stats values and output perfdata
    exit_code, output = get_check_result(
        containers_stats, args.warning, args.critical, errors)
    print(output)
    sys.exit(exit_code)


if __name__ == '__main__':
    try:
        main()
    except SystemExit:
        # Re-raise: swallowing it would turn every WARNING / CRITICAL
        # result into exit status 0.
        raise
    except BaseException as exc:  # pylint: disable=broad-except
        # Report the innermost frame, i.e. where the error was raised.
        FRAME = traceback.extract_tb(exc.__traceback__)[-1]
        print("UNKNOWN: %s Exception \"%s\" in %s line %s" % (
            type(exc).__name__, exc,
            os.path.basename(FRAME.filename), FRAME.lineno))
        sys.exit(3)