Skip to content

Commit

Permalink
Speed up docker_check with parallel check
Browse files Browse the repository at this point in the history
  • Loading branch information
rnt committed Jan 4, 2018
1 parent 8c8ca32 commit cab7a43
Showing 1 changed file with 139 additions and 72 deletions.
211 changes: 139 additions & 72 deletions docker_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
import sys
import argparse

import logging
import queue
import threading

# Configure the root logger once at import time. Each record carries the
# thread name so output from the worker threads can be told apart. The level
# is INFO, so the logging.debug() calls in this file are suppressed.
logging.basicConfig(
format='%(asctime)s level=%(levelname)-7s '
'threadName=%(threadName)s name=%(name)s %(message)s',
level=logging.INFO
)

try:
import docker
except ImportError as error:
Expand All @@ -19,34 +29,40 @@
__version__ = '0.1'


def get_mem_pct(container, stats):
def get_ct_stats(container):
    '''Return the stats of `container`, requested with streaming disabled.'''
    snapshot = container.stats(stream=False)
    return snapshot


def get_mem_pct(stats):
    '''Get a container memory usage in %.

    `stats` is the stats mapping of a single container; its
    `memory_stats` entry must provide `usage` and `limit`.

    Returns `usage` as a percentage of `limit`, rounded to two decimals.
    '''
    # The function receives one container's stats directly, so there is no
    # per-container lookup (`stats[container]`) any more.
    memory = stats['memory_stats']
    return round(memory['usage'] * 100 / memory['limit'], 2)


def get_cpu_pct(container):
'''Get a container cpu usage in % via docker stats cmd'''
usage = str(
os.popen("docker stats --no-stream=true " + container).read()
).split()
usage_pct = usage[usage.index(container) + 1]
return float(usage_pct[:-1])
def get_cpu_pct(stats):
    '''Get a container cpu usage in %.

    `stats` is the stats mapping of a single container. The percentage is
    the container's CPU time delta divided by the system CPU time delta
    between the `precpu_stats` and `cpu_stats` samples, scaled by the
    number of online CPUs.

    Returns 0.0 when either delta is not positive.
    '''
    cpu = stats['cpu_stats']
    precpu = stats['precpu_stats']

    cpu_delta = (cpu['cpu_usage']['total_usage']
                 - precpu['cpu_usage']['total_usage'])
    system_delta = cpu['system_cpu_usage'] - precpu['system_cpu_usage']

    # Older daemons do not report `online_cpus`; the Docker API documentation
    # says to use the length of `percpu_usage` in that case.
    online_cpus = cpu.get('online_cpus') or len(
        cpu['cpu_usage'].get('percpu_usage') or [])

    if cpu_delta > 0 and system_delta > 0:
        return (cpu_delta / system_delta) * online_cpus * 100
    return 0.0


def get_net_io(container, stats):
def get_net_io(stats):
    '''Get a container's network in / out byte counters.

    `stats` is the stats mapping of a single container. Its optional
    `networks` entry maps an interface name to counters that include
    `rx_bytes` and `tx_bytes`.

    Returns a `(net_in, net_out)` tuple summed over every reported
    interface, or `(0, 0)` when no interface is reported.
    '''
    # Do not assume an interface called `eth0`: the `networks` entry can be
    # missing or hold differently named interfaces.
    interfaces = (stats.get('networks') or {}).values()
    net_in = sum(iface['rx_bytes'] for iface in interfaces)
    net_out = sum(iface['tx_bytes'] for iface in interfaces)
    return net_in, net_out


def get_disk_io(container, stats):
def get_disk_io(stats):
'''Get a container's disk in / out usage since its launch'''
disk = stats[container]['blkio_stats']['io_service_bytes_recursive']
disk = stats['blkio_stats']['io_service_bytes_recursive']
try:
disk_in = disk[0]['value']
except IndexError:
Expand All @@ -58,9 +74,54 @@ def get_disk_io(container, stats):
return disk_in, disk_out


def get_ct_stats(container, client):
'''Get container status'''
return client.containers.get(container).stats(stream=False)
def get_ct_metrics(container_queue, containers_stats):
    '''Get container metrics from docker API.

    Worker loop: takes containers from `container_queue` until it is empty
    and stores six metrics per container in the shared `containers_stats`
    dict under the keys `<id>_mem_pct`, `<id>_cpu_pct`, `<id>_net_in`,
    `<id>_net_out`, `<id>_disk_in` and `<id>_disk_out`.

    Every item taken from the queue is acknowledged with `task_done()`,
    even when collecting its metrics raises, so a `join()` on the queue
    cannot wait forever on a failed item. The exception itself is not
    swallowed; it propagates out of this function.
    '''
    logging.debug("Running get_ct_metrics()")
    while True:
        # `empty()` followed by a blocking `get()` lets another worker take
        # the last item in between, leaving this one blocked for good.
        try:
            container = container_queue.get_nowait()
        except queue.Empty:
            break

        try:
            logging.debug("Get container %s stats", container.id)
            stats = get_ct_stats(container)

            mem_pct = get_mem_pct(stats)
            cpu_pct = get_cpu_pct(stats)
            net_in, net_out = get_net_io(stats)
            disk_in, disk_out = get_disk_io(stats)

            # Publish only after every metric has been computed, so a
            # failure never leaves a partial set of keys for a container.
            containers_stats['%s_mem_pct' % container.id] = mem_pct
            containers_stats['%s_cpu_pct' % container.id] = cpu_pct
            containers_stats['%s_net_in' % container.id] = net_in
            containers_stats['%s_net_out' % container.id] = net_out
            containers_stats['%s_disk_in' % container.id] = disk_in
            containers_stats['%s_disk_out' % container.id] = disk_out
        finally:
            container_queue.task_done()
        logging.debug("Done with container %s stats", container.id)
    logging.debug("End get_ct_metrics()")


def get_ct_stats_message(containers_stats):
    '''Build the human-readable check message from containers stats.

    Each `<id>_<metric>_...` key with its value becomes
    "<first 12 chars of id> have <value>% <metric>"; the fragments are
    joined with ", ".
    '''
    fragments = []
    for key, value in containers_stats.items():
        pieces = key.split('_')
        short_id = pieces[0][:12]
        metric = pieces[1]
        fragments.append("%s have %.2f%% %s" % (short_id, value, metric))
    return ', '.join(fragments)


def get_ct_perfdata_message(containers_stats):
    '''Build the perfdata string from containers stats.

    Each key/value pair becomes "<first 12 chars of key>_<label>=<value>",
    where <label> is the second and third "_"-separated parts of the key;
    the entries are joined with single spaces.
    '''
    entries = []
    for key, value in containers_stats.items():
        label = "_".join(key.split("_")[1:3])
        entries.append("%s_%s=%s" % (key[:12], label, value))
    return ' '.join(entries)


def main():
Expand All @@ -74,65 +135,71 @@ def main():

# Try to use the latest API version otherwise use
# the installed client API version
# Get list of running containers
try:
docker.from_env().containers.list()
containers_list = docker.from_env().containers.list()
client = docker.from_env()
except docker.errors.APIError as error:
version = re.sub('[^0-9.]+', '',
str(error).split('server API version:')[1])
client = docker.from_env(version=version)
# Get list of running containers
containers_list = client.containers.list()
containers = []
# If cid is True containers IDs will be used, otherwise names
cid = False
for i in containers_list:
cid = str(i).replace('<', '').replace('>', '').split()[1]
if cid:
containers.append(cid)
else:
containers.append(
os.popen("docker ps -f id=" + cid).read().split()[-1])
# Get stats and metrics
summary = ''
stats = {}
metrics = [0, 0]
ct_stats = {}
for i in containers:
ct_stats[i] = get_ct_stats(i, client)
mem_pct = get_mem_pct(i, ct_stats)
cpu_pct = get_cpu_pct(i)
net_in = get_net_io(i, ct_stats)[0]
net_out = get_net_io(i, ct_stats)[1]
disk_in = get_disk_io(i, ct_stats)[0]
disk_out = get_disk_io(i, ct_stats)[1]
stats[i + '_mem_pct'] = mem_pct
stats[i + '_cpu_pct'] = cpu_pct
summary += '{}_mem_pct={}% {}_cpu_pct={}% {}_net_in={} {}_net_out={} '\
'{}_disk_in={} {}_disk_out={} '.format(
i, mem_pct, i, cpu_pct, i, net_in, i, net_out, i,
disk_in, i, disk_out)
# Get the highest % use
for stat in stats:
if stats[stat] >= metrics[1]:
metrics[0] = stat
metrics[1] = stats[stat]
containers_list = client.containers.list()

logging.debug("containers_list = %s", containers_list)

# START
containers_queue = queue.Queue()
for container in containers_list:
containers_queue.put(container)

containers_stats = {}

# Start one worker thread per container to fetch the stats
for th_id in range(len(containers_list)):
worker = threading.Thread(
target=get_ct_metrics,
args=(containers_queue, containers_stats,),
name='worker-{}'.format(th_id),
)
worker.setDaemon(True)
worker.start()

containers_queue.join()

logging.debug("containers_stats = %s", containers_stats)
stats = {
k: v
for k, v
in containers_stats.items()
if k.endswith('_mem_pct') or k.endswith('_cpu_pct')
}
logging.debug("stats = %s", stats)

# Check stats values and output perfdata
if metrics[1] < args.warning:
print("OK | {}".format(summary))
sys.exit(0)
elif args.warning <= metrics[1] <= args.critical:
print("WARNING: Some containers need your attention: {} have {}%'\
' | {}".format(metrics[0], metrics[1], summary))
sys.exit(1)
elif metrics[1] > 80:
print("CRITICAL: Some containers need your attention: {} have {}%'\
' | {}".format(metrics[0], metrics[1], summary))
critical_ct = {k: v for k, v in stats.items() if v > args.critical}
if critical_ct:
print("CRITICAL: %s | %s" % (
get_ct_stats_message(critical_ct),
get_ct_perfdata_message(containers_stats)))
sys.exit(2)
else:
print("UKNOWN | {}".format(summary))
sys.exit(3)

warning_ct = {k: v for k, v in stats.items() if v > args.warning}
if warning_ct:
print("WARNING: %s | %s" % (
get_ct_stats_message(warning_ct),
get_ct_perfdata_message(containers_stats)))
sys.exit(1)

print("OK | %s" % get_ct_perfdata_message(containers_stats))
sys.exit(0)


if __name__ == '__main__':
    # Run the check exactly once. Any unexpected error is reported as
    # UNKNOWN with exit code 3.
    try:
        main()
    # Catch Exception, not BaseException: main() finishes through
    # sys.exit(), and the SystemExit it raises must propagate, otherwise
    # every OK / WARNING / CRITICAL result is rewritten to UNKNOWN.
    except Exception as exc:  # pylint: disable=broad-except
        # Walk to the innermost traceback entry so the reported file and
        # line are where the error was raised, not the main() call above.
        EXC_TRACEBACK = exc.__traceback__
        while EXC_TRACEBACK.tb_next is not None:
            EXC_TRACEBACK = EXC_TRACEBACK.tb_next
        FNAME = os.path.split(EXC_TRACEBACK.tb_frame.f_code.co_filename)[1]
        print("UNKNOWN: %s Exception \"%s\" in %s line %s" % (
            type(exc).__name__, exc, FNAME, EXC_TRACEBACK.tb_lineno))
        sys.exit(3)

0 comments on commit cab7a43

Please sign in to comment.