From 52b3d71ab17b7f53beceab0196775fe4f5a2a68c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 10 Aug 2018 17:18:15 +0200 Subject: [PATCH 001/648] initial commit --- drone.py | 33 ++++++++++++++++++++++++++ globals.py | 2 ++ job.py | 37 +++++++++++++++++++++++++++++ main.py | 20 ++++++++++++++++ pool.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++ scheduler.py | 23 ++++++++++++++++++ 6 files changed, 182 insertions(+) create mode 100644 drone.py create mode 100644 globals.py create mode 100644 job.py create mode 100644 main.py create mode 100644 pool.py create mode 100644 scheduler.py diff --git a/drone.py b/drone.py new file mode 100644 index 0000000..401bca8 --- /dev/null +++ b/drone.py @@ -0,0 +1,33 @@ +from job import job + + +class Drone(object): + def __init__(self, env, pool, scheduling_duration): + self.env = env + self.pool = pool + self.action = env.process(self.run(scheduling_duration)) + self._memory = 0 + self._disk = 0 + self._cores = 0 + + def run(self, scheduling_duration): + yield self.env.timeout(scheduling_duration) + print("drone is alive at", self.env.now) + self.pool.add_drone(self) + + def start_job(self, walltime, memory, cores, disk): + print("starting job at", self.env.now) + if (self._memory + memory > self.pool.memory or + self._disk + disk > self.pool.disk or + self._cores + cores > self.pool.cores): + # TODO: kill job + pass + self._memory += memory + self._disk += disk + self._cores += cores + self.env.process(job(self.env, walltime, memory, cores, disk)) + self._memory -= memory + self._disk -= disk + self._cores -= cores + # put drone back into pool queue + self.pool.add_drone(self) diff --git a/globals.py b/globals.py new file mode 100644 index 0000000..6f6b3e2 --- /dev/null +++ b/globals.py @@ -0,0 +1,2 @@ +pools = [] +global_demand = None diff --git a/job.py b/job.py new file mode 100644 index 0000000..b5b25b8 --- /dev/null +++ b/job.py @@ -0,0 +1,37 @@ +import random +import math +import globals + + +def job_demand(env): + """ + function randomly sets global user demand by using different strategies + :param env: + :return: + """ + while True: + delay = random.randint(0, 100) + strategy = random.random() + if strategy < 1/3: + # linear amount + print("strategy: linear amount") + amount = random.randint(0, int(random.random()*100)) + elif strategy < 2/3: + # exponential amount + print("strategy: exponential amount") + amount = (math.e**(random.random())-1)*random.random()*1000 + else: + # sqrt + print("strategy: sqrt amount") + amount = math.sqrt(random.random()*random.random()*100) + value = yield env.timeout(delay=delay, value=amount) + value = round(value) + if value > 0: + globals.global_demand.put(value) + print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) + + +def job(env, walltime, memory, cores, disk): + globals.global_demand.get(1) + yield env.timeout(walltime) + print("job finished") diff --git a/main.py b/main.py new file mode 100644 index 0000000..d88168e --- /dev/null +++ b/main.py @@ -0,0 +1,20 @@ +import simpy +import globals +from job import job_demand +from scheduler import drone_scheduler, job_scheduler +from pool import Pool + + +def main(): + env = simpy.Environment() + for i in range(10): + globals.pools.append(Pool(env)) + globals.global_demand = simpy.Container(env) + env.process(job_demand(env)) + env.process(drone_scheduler(env)) + env.process(job_scheduler(env)) + env.run(until=100) + + +if __name__ == "__main__": + main() diff --git a/pool.py b/pool.py new 
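The initial commit builds everything from a handful of SimPy primitives that recur in every later patch: generator functions registered as processes via env.process, env.timeout to let simulated time pass, and (in main.py) a simpy.Container whose level models the outstanding user demand. A minimal, self-contained sketch of that pattern — the names here are illustrative and not part of the simulation code:

import simpy


def producer(env, demand):
    # raise demand by one unit every 5 time steps
    while True:
        yield env.timeout(5)
        yield demand.put(1)
        print("demand raised to", demand.level, "at", env.now)


def consumer(env, demand):
    # handle one unit of demand per time step, blocking while none is available
    while True:
        yield demand.get(1)
        yield env.timeout(1)
        print("unit handled at", env.now)


env = simpy.Environment()
demand = simpy.Container(env)
env.process(producer(env, demand))
env.process(consumer(env, demand))
env.run(until=20)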
file mode 100644 index 0000000..24aa36f --- /dev/null +++ b/pool.py @@ -0,0 +1,67 @@ +from simpy.resources import container +import globals +from drone import Drone + + +def pool_supply(): + result = 0 + for pool in globals.pools: + result += pool.supply + return result + + +def pool_demand(): + result = 0 + for pool in globals.pools: + result += pool.demand + return result + + +class Pool(container.Container): + def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=100): + super(Pool, self).__init__(env, capacity, init) + self.memory = memory + self.cores = cores + self.disk = disk + self._demand = 0 + self._supply = 0 + self._drones = [] + self._drones_in_use = [] + self.env = env + self.action = env.process(self.run()) + + def run(self): + while True: + if self._supply < self._demand: + # start a new drone + self._supply += 1 + Drone(self.env, self, 10) + yield self.env.timeout(1) + + @property + def supply(self): + return self._supply + + @property + def demand(self): + return self._demand + + @demand.setter + def demand(self, value): + self._demand = value + + def add_drone(self, drone): + try: + self._drones_in_use.remove(drone) + except ValueError: + # drone not already existent + pass + self._drones.append(drone) + self.put(1) + print("[supply] pool supply at %d / %d (available: %d)" % (self._supply, self._demand, self.level)) + + def get_drone(self, amount): + super(Pool, self).get(amount) + drone = self._drones.pop(0) + self._drones_in_use.append(drone) + return drone diff --git a/scheduler.py b/scheduler.py new file mode 100644 index 0000000..af7742c --- /dev/null +++ b/scheduler.py @@ -0,0 +1,23 @@ +import globals +from pool import pool_demand + + +def drone_scheduler(env): + while True: + for pool in globals.pools: + demand = pool_demand() + if demand < globals.global_demand.level: + # ask for another drone in the pool + pool.demand += 1 + print("[demand] pool %f vs user %f" % (demand, globals.global_demand.level)) + yield env.timeout(0.1) + + +def job_scheduler(env): + while True: + for pool in globals.pools: + if pool.level > 0 and globals.global_demand.level > 0: + drone = pool.get_drone(1) + drone.start_job(walltime=10, memory=2, cores=1, disk=100) + yield env.timeout(.1) + From a4daeb80add38f5c338b8fe0f7c4e45db13d5031 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 10 Aug 2018 17:52:53 +0200 Subject: [PATCH 002/648] simulation is now also capable to lower demands --- drone.py | 4 ++++ main.py | 2 +- pool.py | 6 ++++++ scheduler.py | 9 ++++++--- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drone.py b/drone.py index 401bca8..dea34e2 100644 --- a/drone.py +++ b/drone.py @@ -15,6 +15,10 @@ def run(self, scheduling_duration): print("drone is alive at", self.env.now) self.pool.add_drone(self) + def shutdown(self): + yield self.env.timeout(1) + print("drone has been shut down") + def start_job(self, walltime, memory, cores, disk): print("starting job at", self.env.now) if (self._memory + memory > self.pool.memory or diff --git a/main.py b/main.py index d88168e..8076bd3 100644 --- a/main.py +++ b/main.py @@ -13,7 +13,7 @@ def main(): env.process(job_demand(env)) env.process(drone_scheduler(env)) env.process(job_scheduler(env)) - env.run(until=100) + env.run(until=1000) if __name__ == "__main__": diff --git a/pool.py b/pool.py index 24aa36f..2a16698 100644 --- a/pool.py +++ b/pool.py @@ -36,6 +36,12 @@ def run(self): # start a new drone self._supply += 1 Drone(self.env, self, 10) + elif self._supply > self._demand: + self.get(1) + 
drone = self._drones.pop(0) + self._supply -= 1 + yield from drone.shutdown() + del drone yield self.env.timeout(1) @property diff --git a/scheduler.py b/scheduler.py index af7742c..5d042c1 100644 --- a/scheduler.py +++ b/scheduler.py @@ -9,8 +9,11 @@ def drone_scheduler(env): if demand < globals.global_demand.level: # ask for another drone in the pool pool.demand += 1 - print("[demand] pool %f vs user %f" % (demand, globals.global_demand.level)) - yield env.timeout(0.1) + elif demand > globals.global_demand.level and pool.demand > 0: + # lower demand and ask for stopping drones + pool.demand -= 1 + print("[demand] pool %f vs user %f" % (demand, globals.global_demand.level)) + yield env.timeout(1) def job_scheduler(env): @@ -19,5 +22,5 @@ def job_scheduler(env): if pool.level > 0 and globals.global_demand.level > 0: drone = pool.get_drone(1) drone.start_job(walltime=10, memory=2, cores=1, disk=100) - yield env.timeout(.1) + yield env.timeout(1) From 1182f83c310a8ee1caf76c35ea46cca1af6d8a4d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 10 Aug 2018 19:00:26 +0200 Subject: [PATCH 003/648] fixed bug and added yield --- drone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drone.py b/drone.py index dea34e2..d179314 100644 --- a/drone.py +++ b/drone.py @@ -29,7 +29,7 @@ def start_job(self, walltime, memory, cores, disk): self._memory += memory self._disk += disk self._cores += cores - self.env.process(job(self.env, walltime, memory, cores, disk)) + yield self.env.process(job(self.env, walltime, memory, cores, disk)) self._memory -= memory self._disk -= disk self._cores -= cores From 8b1e7e0b279d84585a0fb2e403b6fb9860e42bf4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 10 Aug 2018 19:01:28 +0200 Subject: [PATCH 004/648] added generator for job properties --- globals.py | 1 + job.py | 5 +++++ main.py | 3 ++- scheduler.py | 2 +- 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/globals.py b/globals.py index 6f6b3e2..f628722 100644 --- a/globals.py +++ b/globals.py @@ -1,2 +1,3 @@ pools = [] global_demand = None +job_generator = None diff --git a/job.py b/job.py index b5b25b8..e12e80f 100644 --- a/job.py +++ b/job.py @@ -35,3 +35,8 @@ def job(env, walltime, memory, cores, disk): globals.global_demand.get(1) yield env.timeout(walltime) print("job finished") + + +def job_property_generator(): + while True: + yield 10, 2, 1, 100 diff --git a/main.py b/main.py index 8076bd3..4827c39 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ import simpy import globals -from job import job_demand +from job import job_demand, job_property_generator from scheduler import drone_scheduler, job_scheduler from pool import Pool @@ -10,6 +10,7 @@ def main(): for i in range(10): globals.pools.append(Pool(env)) globals.global_demand = simpy.Container(env) + globals.job_generator = job_property_generator() env.process(job_demand(env)) env.process(drone_scheduler(env)) env.process(job_scheduler(env)) diff --git a/scheduler.py b/scheduler.py index 5d042c1..a4a3b2f 100644 --- a/scheduler.py +++ b/scheduler.py @@ -21,6 +21,6 @@ def job_scheduler(env): for pool in globals.pools: if pool.level > 0 and globals.global_demand.level > 0: drone = pool.get_drone(1) - drone.start_job(walltime=10, memory=2, cores=1, disk=100) + drone.start_job(*next(globals.job_generator)) yield env.timeout(1) From 83c75fe85465d6e5ac5abb8bee9bcb8a2b3e86b7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 10 Aug 2018 19:01:49 +0200 Subject: [PATCH 005/648] made pool conform to cobald --- pool.py | 
14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pool.py b/pool.py index 2a16698..4a668bd 100644 --- a/pool.py +++ b/pool.py @@ -1,4 +1,5 @@ from simpy.resources import container +from cobald import interfaces import globals from drone import Drone @@ -17,7 +18,7 @@ def pool_demand(): return result -class Pool(container.Container): +class Pool(interfaces.Pool, container.Container): def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=100): super(Pool, self).__init__(env, capacity, init) self.memory = memory @@ -44,6 +45,14 @@ def run(self): del drone yield self.env.timeout(1) + @property + def allocation(self) -> float: + return len(self._drones_in_use) / self._supply + + @property + def utilisation(self) -> float: + return 0 + @property def supply(self): return self._supply @@ -64,7 +73,8 @@ def add_drone(self, drone): pass self._drones.append(drone) self.put(1) - print("[supply] pool supply at %d / %d (available: %d)" % (self._supply, self._demand, self.level)) + print("[supply] pool supply at %d / %d (available: %d, allocation: %.2f, utilisation: %.2f)" + % (self._supply, self._demand, self.level, self.allocation, self.utilisation)) def get_drone(self, amount): super(Pool, self).get(amount) From a7f4a035e7899783c3cbaa7ec7a136d0bbaf60f6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 14 Aug 2018 21:31:03 +0200 Subject: [PATCH 006/648] drone now also implements pool interface --- drone.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/drone.py b/drone.py index d179314..0d5c414 100644 --- a/drone.py +++ b/drone.py @@ -1,26 +1,51 @@ +from cobald import interfaces + from job import job -class Drone(object): +class Drone(interfaces.Pool): def __init__(self, env, pool, scheduling_duration): + super(Drone, self).__init__() self.env = env self.pool = pool self.action = env.process(self.run(scheduling_duration)) self._memory = 0 self._disk = 0 self._cores = 0 + self._supply = 0 def run(self, scheduling_duration): yield self.env.timeout(scheduling_duration) - print("drone is alive at", self.env.now) - self.pool.add_drone(self) + self._supply = 1 + yield from self.pool.drone_ready(self) + + @property + def supply(self): + return self._supply + + @property + def demand(self): + return 1 + + @demand.setter + def demand(self, value): + pass # demand is always defined as 1 + + @property + def utilisation(self): + return ((self._memory / self.pool.memory) + (self._disk / self.pool.disk) + (self._cores / self.pool.cores)) / 3 + + @property + def allocation(self): + return self._memory > 0 or self._disk > 0 or self._cores > 0 def shutdown(self): + self._supply = 0 yield self.env.timeout(1) - print("drone has been shut down") + print("[drone %s] has been shut down" % self) def start_job(self, walltime, memory, cores, disk): - print("starting job at", self.env.now) + print("[drone %s] starting job at %d" % (self, self.env.now)) if (self._memory + memory > self.pool.memory or self._disk + disk > self.pool.disk or self._cores + cores > self.pool.cores): @@ -34,4 +59,6 @@ def start_job(self, walltime, memory, cores, disk): self._disk -= disk self._cores -= cores # put drone back into pool queue + print("[drone %s] finished job at %d" % (self, self.env.now)) self.pool.add_drone(self) + yield from self.pool.drone_ready(self) From fc8d424c3654362fffd7b90183efe8d2e080796e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 14 Aug 2018 21:32:05 +0200 Subject: [PATCH 007/648] utilisation of 
linearcontroller from cobald --- controller.py | 19 ++++++++++++++ job.py | 5 ++-- main.py | 10 +++++--- pool.py | 71 +++++++++++++++++++++++++++++---------------------- scheduler.py | 25 ++++-------------- 5 files changed, 73 insertions(+), 57 deletions(-) create mode 100644 controller.py diff --git a/controller.py b/controller.py new file mode 100644 index 0000000..f37f070 --- /dev/null +++ b/controller.py @@ -0,0 +1,19 @@ +from cobald.controller.linear import LinearController +from cobald.interfaces import Pool + +import globals + + +class SimulatedLinearController(LinearController): + def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1): + super(SimulatedLinearController, self).__init__(target, low_utilisation, high_allocation, rate) + self.env = env + self.action = env.process(self.run()) + + def run(self): + while True: + print("[controller] demand %d, supply %d (global %d), allocation %.2f, utilisation %.2f (available %d)" % ( + self.target.demand, self.target.supply, globals.global_demand.level, self.target.allocation, + self.target.utilisation, self.target.level)) + self.regulate_demand() + yield self.env.timeout(self._interval) diff --git a/job.py b/job.py index e12e80f..67ea218 100644 --- a/job.py +++ b/job.py @@ -32,11 +32,12 @@ def job_demand(env): def job(env, walltime, memory, cores, disk): + print("starting job at", env.now) globals.global_demand.get(1) yield env.timeout(walltime) - print("job finished") + print("job finished", env.now) def job_property_generator(): while True: - yield 10, 2, 1, 100 + yield 10, 8, 1, 100 diff --git a/main.py b/main.py index 4827c39..9fef68f 100644 --- a/main.py +++ b/main.py @@ -1,18 +1,20 @@ import simpy import globals from job import job_demand, job_property_generator -from scheduler import drone_scheduler, job_scheduler +from scheduler import job_scheduler from pool import Pool +from controller import SimulatedLinearController def main(): env = simpy.Environment() + globals.job_generator = job_property_generator() for i in range(10): - globals.pools.append(Pool(env)) + pool = Pool(env) + globals.pools.append(pool) + SimulatedLinearController(env, target=pool, rate=1) globals.global_demand = simpy.Container(env) - globals.job_generator = job_property_generator() env.process(job_demand(env)) - env.process(drone_scheduler(env)) env.process(job_scheduler(env)) env.run(until=1000) diff --git a/pool.py b/pool.py index 4a668bd..e90ffd0 100644 --- a/pool.py +++ b/pool.py @@ -1,23 +1,8 @@ from simpy.resources import container from cobald import interfaces -import globals from drone import Drone -def pool_supply(): - result = 0 - for pool in globals.pools: - result += pool.supply - return result - - -def pool_demand(): - result = 0 - for pool in globals.pools: - result += pool.demand - return result - - class Pool(interfaces.Pool, container.Container): def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=100): super(Pool, self).__init__(env, capacity, init) @@ -25,7 +10,6 @@ def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=1 self.cores = cores self.disk = disk self._demand = 0 - self._supply = 0 self._drones = [] self._drones_in_use = [] self.env = env @@ -33,29 +17,45 @@ def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=1 def run(self): while True: - if self._supply < self._demand: + if self.drone_demand() < self._demand: # start a new drone - self._supply += 1 - Drone(self.env, self, 10) - elif self._supply > self._demand: - 
self.get(1) + self.add_drone(Drone(self.env, self, 10)) + elif self.drone_demand() > self._demand: + yield self.get(1) drone = self._drones.pop(0) - self._supply -= 1 yield from drone.shutdown() del drone yield self.env.timeout(1) + def drone_demand(self): + return len(self._drones) + len(self._drones_in_use) + @property def allocation(self) -> float: - return len(self._drones_in_use) / self._supply + allocations = [] + for drone in self.drones(): + allocations.append(drone.allocation) + try: + return sum(allocations) / len(allocations) + except ZeroDivisionError: + return 1 @property def utilisation(self) -> float: - return 0 + utilisations = [] + for drone in self._drones_in_use: + utilisations.append(drone.utilisation) + try: + return sum(utilisations) / len(utilisations) + except ZeroDivisionError: + return 1 @property def supply(self): - return self._supply + supply = 0 + for drone in self.drones(): + supply += drone.supply + return supply @property def demand(self): @@ -63,21 +63,30 @@ def demand(self): @demand.setter def demand(self, value): - self._demand = value + if value > 0: + self._demand = value + else: + self._demand = 0 def add_drone(self, drone): try: self._drones_in_use.remove(drone) except ValueError: - # drone not already existent pass + + def drone_ready(self, drone): + print("[drone %s] is ready at %d" % (drone, self.env.now)) self._drones.append(drone) - self.put(1) - print("[supply] pool supply at %d / %d (available: %d, allocation: %.2f, utilisation: %.2f)" - % (self._supply, self._demand, self.level, self.allocation, self.utilisation)) + yield self.put(1) def get_drone(self, amount): - super(Pool, self).get(amount) + yield self.get(amount) drone = self._drones.pop(0) self._drones_in_use.append(drone) return drone + + def drones(self): + for drone in self._drones: + yield drone + for drone in self._drones_in_use: + yield drone diff --git a/scheduler.py b/scheduler.py index a4a3b2f..30d574c 100644 --- a/scheduler.py +++ b/scheduler.py @@ -1,26 +1,11 @@ import globals -from pool import pool_demand - - -def drone_scheduler(env): - while True: - for pool in globals.pools: - demand = pool_demand() - if demand < globals.global_demand.level: - # ask for another drone in the pool - pool.demand += 1 - elif demand > globals.global_demand.level and pool.demand > 0: - # lower demand and ask for stopping drones - pool.demand -= 1 - print("[demand] pool %f vs user %f" % (demand, globals.global_demand.level)) - yield env.timeout(1) def job_scheduler(env): while True: for pool in globals.pools: - if pool.level > 0 and globals.global_demand.level > 0: - drone = pool.get_drone(1) - drone.start_job(*next(globals.job_generator)) - yield env.timeout(1) - + while pool.level > 0 and globals.global_demand.level > 0: + drone = yield from pool.get_drone(1) + env.process(drone.start_job(*next(globals.job_generator))) + yield env.timeout(0) + yield env.timeout(1) From 194cd5f77585262b003aa648acd02a774ef43e51 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 16 Aug 2018 21:42:03 +0200 Subject: [PATCH 008/648] added some first plotted data to simulation --- controller.py | 19 ++++++++++++++++--- globals.py | 4 ++++ job.py | 11 ++++++----- main.py | 38 ++++++++++++++++++++++++++++++++++++++ pool.py | 38 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 100 insertions(+), 10 deletions(-) diff --git a/controller.py b/controller.py index f37f070..74ef6f7 100644 --- a/controller.py +++ b/controller.py @@ -3,6 +3,9 @@ import globals +from cost import cobald_cost +from pool import 
pool_demands, pool_allocation, pool_utilisation, pool_unused + class SimulatedLinearController(LinearController): def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1): @@ -12,8 +15,18 @@ def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, def run(self): while True: - print("[controller] demand %d, supply %d (global %d), allocation %.2f, utilisation %.2f (available %d)" % ( - self.target.demand, self.target.supply, globals.global_demand.level, self.target.allocation, - self.target.utilisation, self.target.level)) + pre_demand = self.target.demand self.regulate_demand() + # print("[controller] demand %d -> %d, supply %d (global %d), allocation %.2f, utilisation %.2f " + # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, + # self.target.allocation, self.target.utilisation, self.target.level)) + globals.monitoring_data[round(self.env.now)]["user_demand"] = globals.global_demand.level + globals.monitoring_data[round(self.env.now)]["pool_demand"] = pool_demands() + globals.monitoring_data[round(self.env.now)]["pool_utilisation"] = pool_utilisation() + globals.monitoring_data[round(self.env.now)]["pool_allocation"] = pool_allocation() + globals.monitoring_data[round(self.env.now)]["pool_unused"] = pool_unused() * -1 + current_cost = cobald_cost() + globals.cost += current_cost + globals.monitoring_data[round(self.env.now)]["cost"] = current_cost + globals.monitoring_data[round(self.env.now)]["acc_cost"] =globals.cost yield self.env.timeout(self._interval) diff --git a/globals.py b/globals.py index f628722..572daba 100644 --- a/globals.py +++ b/globals.py @@ -1,3 +1,7 @@ +from collections import defaultdict + pools = [] global_demand = None job_generator = None +monitoring_data = defaultdict(dict) # {tme: {variable: value, ...}} +cost = 0 diff --git a/job.py b/job.py index 67ea218..6cdf87f 100644 --- a/job.py +++ b/job.py @@ -14,28 +14,29 @@ def job_demand(env): strategy = random.random() if strategy < 1/3: # linear amount - print("strategy: linear amount") + # print("strategy: linear amount") amount = random.randint(0, int(random.random()*100)) elif strategy < 2/3: # exponential amount - print("strategy: exponential amount") + # print("strategy: exponential amount") amount = (math.e**(random.random())-1)*random.random()*1000 else: # sqrt - print("strategy: sqrt amount") + # print("strategy: sqrt amount") amount = math.sqrt(random.random()*random.random()*100) value = yield env.timeout(delay=delay, value=amount) value = round(value) if value > 0: globals.global_demand.put(value) - print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) + globals.monitoring_data[round(env.now)]["user_demand_new"] = value + # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) def job(env, walltime, memory, cores, disk): print("starting job at", env.now) globals.global_demand.get(1) yield env.timeout(walltime) - print("job finished", env.now) + # print("job finished", env.now) def job_property_generator(): diff --git a/main.py b/main.py index 9fef68f..41c0bab 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,8 @@ import simpy +import random + +import matplotlib.pyplot as plt + import globals from job import job_demand, job_property_generator from scheduler import job_scheduler @@ -7,6 +11,7 @@ def main(): + random.seed(1234) env = simpy.Environment() globals.job_generator = job_property_generator() for i in 
range(10): @@ -18,6 +23,39 @@ def main(): env.process(job_scheduler(env)) env.run(until=1000) + # Plotting some first results + plt.plot(globals.monitoring_data.keys(), [value.get("user_demand", None) for value in globals.monitoring_data.values()]) + plt.plot(globals.monitoring_data.keys(), + [value.get("user_demand_new", None) for value in globals.monitoring_data.values()], + 'ro') + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_demand", None) for value in globals.monitoring_data.values()]) + plt.show() + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_utilisation", None) for value in globals.monitoring_data.values()]) + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_allocation", None) for value in globals.monitoring_data.values()]) + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_unused", None) for value in globals.monitoring_data.values()]) + plt.show() + + fig, ax1 = plt.subplots() + ax1.plot(globals.monitoring_data.keys(), + [value.get("cost", None) for value in globals.monitoring_data.values()], 'b-') + ax1.set_xlabel('Time') + # Make the y-axis label, ticks and tick labels match the line color. + ax1.set_ylabel('Cost', color='b') + ax1.tick_params('y', colors='b') + + ax2 = ax1.twinx() + ax2.plot(globals.monitoring_data.keys(), + [value.get("acc_cost", None) for value in globals.monitoring_data.values()], 'r.') + ax2.set_ylabel('Accumulated Cost', color='r') + ax2.tick_params('y', colors='r') + + fig.tight_layout() + plt.show() + if __name__ == "__main__": main() diff --git a/pool.py b/pool.py index e90ffd0..3220165 100644 --- a/pool.py +++ b/pool.py @@ -1,8 +1,42 @@ from simpy.resources import container from cobald import interfaces + +import globals from drone import Drone +def pool_demands(): + result = 0 + for pool in globals.pools: + result += pool.demand + return result + + +def pool_utilisation(): + result = [] + for pool in globals.pools: + for drone in pool.drones(): + result.append(drone.utilisation) + return sum(result) + + +def pool_allocation(): + result = [] + for pool in globals.pools: + for drone in pool.drones(): + result.append(drone.allocation) + return sum(result) + + +def pool_unused(): + result = 0 + for pool in globals.pools: + for drone in pool.drones(): + if drone.allocation == 0: + result += 1 + return result + + class Pool(interfaces.Pool, container.Container): def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=100): super(Pool, self).__init__(env, capacity, init) @@ -43,7 +77,7 @@ def allocation(self) -> float: @property def utilisation(self) -> float: utilisations = [] - for drone in self._drones_in_use: + for drone in self.drones(): utilisations.append(drone.utilisation) try: return sum(utilisations) / len(utilisations) @@ -75,7 +109,7 @@ def add_drone(self, drone): pass def drone_ready(self, drone): - print("[drone %s] is ready at %d" % (drone, self.env.now)) + # print("[drone %s] is ready at %d" % (drone, self.env.now)) self._drones.append(drone) yield self.put(1) From bfa29670d8890bd3f6b558055e09de726c38331a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 16 Aug 2018 21:42:29 +0200 Subject: [PATCH 009/648] generalised resources for pools and drones --- cost.py | 14 ++++++++++++++ drone.py | 42 +++++++++++++++++++++--------------------- job.py | 6 +++--- pool.py | 6 ++---- 4 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 cost.py diff --git a/cost.py b/cost.py new file mode 100644 index 0000000..b6aa952 --- /dev/null +++ b/cost.py @@ -0,0 +1,14 
@@ +import globals + + +def cobald_cost(): + result = globals.global_demand.level + for pool in globals.pools: + for drone in pool.drones(): + result += 1 + tmp = 0 + for resource_key in pool.resources: + tmp += drone.resources[resource_key] / pool.resources[resource_key] + tmp /= len(pool.resources) + result -= tmp + return result diff --git a/drone.py b/drone.py index 0d5c414..cb978ec 100644 --- a/drone.py +++ b/drone.py @@ -9,9 +9,7 @@ def __init__(self, env, pool, scheduling_duration): self.env = env self.pool = pool self.action = env.process(self.run(scheduling_duration)) - self._memory = 0 - self._disk = 0 - self._cores = 0 + self.resources = {resource: 0 for resource in self.pool.resources} self._supply = 0 def run(self, scheduling_duration): @@ -33,32 +31,34 @@ def demand(self, value): @property def utilisation(self): - return ((self._memory / self.pool.memory) + (self._disk / self.pool.disk) + (self._cores / self.pool.cores)) / 3 + result = 0 + for resource in self.resources: + result += self.resources[resource] / self.pool.resources[resource] + return result / len(self.resources) + #return min((self._memory / self.pool.memory), (self._disk / self.pool.disk), (self._cores / self.pool.cores)) @property def allocation(self): - return self._memory > 0 or self._disk > 0 or self._cores > 0 + return sum(self.resources.values()) > 0 + #return max((self._memory / self.pool.memory), (self._disk / self.pool.disk), (self._cores / self.pool.cores)) def shutdown(self): self._supply = 0 yield self.env.timeout(1) - print("[drone %s] has been shut down" % self) + # print("[drone %s] has been shut down" % self) - def start_job(self, walltime, memory, cores, disk): - print("[drone %s] starting job at %d" % (self, self.env.now)) - if (self._memory + memory > self.pool.memory or - self._disk + disk > self.pool.disk or - self._cores + cores > self.pool.cores): - # TODO: kill job - pass - self._memory += memory - self._disk += disk - self._cores += cores - yield self.env.process(job(self.env, walltime, memory, cores, disk)) - self._memory -= memory - self._disk -= disk - self._cores -= cores + def start_job(self, walltime, resources): + # print("[drone %s] starting job at %d" % (self, self.env.now)) + for resource_key in resources: + if self.resources[resource_key] + resources[resource_key]: + # TODO: kill job + pass + for resource_key in resources: + self.resources[resource_key] += resources[resource_key] + yield self.env.process(job(self.env, walltime, resources)) + for resource_key in resources: + self.resources[resource_key] -= resources[resource_key] # put drone back into pool queue - print("[drone %s] finished job at %d" % (self, self.env.now)) + # print("[drone %s] finished job at %d" % (self, self.env.now)) self.pool.add_drone(self) yield from self.pool.drone_ready(self) diff --git a/job.py b/job.py index 6cdf87f..7054ad6 100644 --- a/job.py +++ b/job.py @@ -32,8 +32,8 @@ def job_demand(env): # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) -def job(env, walltime, memory, cores, disk): - print("starting job at", env.now) +def job(env, walltime, resources): + # print("starting job at", env.now) globals.global_demand.get(1) yield env.timeout(walltime) # print("job finished", env.now) @@ -41,4 +41,4 @@ def job(env, walltime, memory, cores, disk): def job_property_generator(): while True: - yield 10, 8, 1, 100 + yield 10, {"memory": 8, "cores": 1, "disk": 100} diff --git a/pool.py b/pool.py index 3220165..5ae6d3d 100644 --- a/pool.py +++ 
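The cobald_cost() introduced above reduces to the pending user demand plus, for every drone, one minus the average fraction of its pool's resources that the drone currently uses — idle drones are the most expensive, fully utilised drones cost nothing. A small worked illustration with made-up numbers (not values from the simulation):

pool_resources = {"memory": 8, "cores": 1, "disk": 100}
drones_in_use = [
    {"memory": 8, "cores": 1, "disk": 100},  # fully utilised drone
    {"memory": 0, "cores": 0, "disk": 0},    # idle drone
]
pending_demand = 3  # stands in for globals.global_demand.level

cost = pending_demand
for used in drones_in_use:
    fraction = sum(used[key] / pool_resources[key] for key in pool_resources)
    cost += 1 - fraction / len(pool_resources)

print(cost)  # 3 + (1 - 1.0) + (1 - 0.0) = 4.0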
b/pool.py @@ -38,11 +38,9 @@ def pool_unused(): class Pool(interfaces.Pool, container.Container): - def __init__(self, env, capacity=float('inf'), init=0, memory=8, cores=1, disk=100): + def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8, "cores": 1, "disk": 100}): super(Pool, self).__init__(env, capacity, init) - self.memory = memory - self.cores = cores - self.disk = disk + self.resources = resources self._demand = 0 self._drones = [] self._drones_in_use = [] From 350945260a367d5a4a0b1392b702fb8436746415 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 16 Aug 2018 22:19:18 +0200 Subject: [PATCH 010/648] added logging for each time step into main --- controller.py | 15 --------------- main.py | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/controller.py b/controller.py index 74ef6f7..51e09b4 100644 --- a/controller.py +++ b/controller.py @@ -1,11 +1,6 @@ from cobald.controller.linear import LinearController from cobald.interfaces import Pool -import globals - -from cost import cobald_cost -from pool import pool_demands, pool_allocation, pool_utilisation, pool_unused - class SimulatedLinearController(LinearController): def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1): @@ -15,18 +10,8 @@ def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, def run(self): while True: - pre_demand = self.target.demand self.regulate_demand() # print("[controller] demand %d -> %d, supply %d (global %d), allocation %.2f, utilisation %.2f " # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, # self.target.allocation, self.target.utilisation, self.target.level)) - globals.monitoring_data[round(self.env.now)]["user_demand"] = globals.global_demand.level - globals.monitoring_data[round(self.env.now)]["pool_demand"] = pool_demands() - globals.monitoring_data[round(self.env.now)]["pool_utilisation"] = pool_utilisation() - globals.monitoring_data[round(self.env.now)]["pool_allocation"] = pool_allocation() - globals.monitoring_data[round(self.env.now)]["pool_unused"] = pool_unused() * -1 - current_cost = cobald_cost() - globals.cost += current_cost - globals.monitoring_data[round(self.env.now)]["cost"] = current_cost - globals.monitoring_data[round(self.env.now)]["acc_cost"] =globals.cost yield self.env.timeout(self._interval) diff --git a/main.py b/main.py index 41c0bab..e099bac 100644 --- a/main.py +++ b/main.py @@ -1,18 +1,56 @@ +from functools import partial, wraps import simpy import random import matplotlib.pyplot as plt import globals +from cost import cobald_cost from job import job_demand, job_property_generator from scheduler import job_scheduler -from pool import Pool +from pool import Pool, pool_demands, pool_utilisation, pool_allocation, pool_unused from controller import SimulatedLinearController +def trace(env, callback): + def get_wrapper(env_step, callback): + @wraps(env_step) + def tracing_step(): + if len(env._queue): + t, prio, eid, event = env._queue[0] + callback(t, prio, eid, event) + return env_step() + return tracing_step + env.step = get_wrapper(env.step, callback) + + +last_step = 0 + + +def monitor(data, t, prio, eid, event): + global last_step + # TODO: do relevant monitoring + if t > last_step: + # new data to be recorded + tmp = round(t) + data[tmp]["user_demand"] = globals.global_demand.level + data[tmp]["pool_demand"] = pool_demands() + data[tmp]["pool_utilisation"] = 
pool_utilisation() + data[tmp]["pool_allocation"] = pool_allocation() + data[tmp]["pool_unused"] = pool_unused() * -1 + current_cost = cobald_cost() + data[tmp]["cost"] = current_cost + globals.cost += current_cost + data[tmp]["acc_cost"] = globals.cost + last_step = tmp + + def main(): + monitor_data = partial(monitor, globals.monitoring_data) + random.seed(1234) env = simpy.Environment() + trace(env, monitor_data) globals.job_generator = job_property_generator() for i in range(10): pool = Pool(env) From 62abb6b53e1c8becb85fdc505ef59c7852761a38 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 17 Aug 2018 15:45:07 +0200 Subject: [PATCH 011/648] included scheduling that might mimic condor --- cost.py | 2 +- drone.py | 7 ++---- job.py | 16 +++++++++----- main.py | 7 ++++-- pool.py | 41 ++++++++++------------------------ scheduler.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 43 deletions(-) diff --git a/cost.py b/cost.py index b6aa952..f321c97 100644 --- a/cost.py +++ b/cost.py @@ -4,7 +4,7 @@ def cobald_cost(): result = globals.global_demand.level for pool in globals.pools: - for drone in pool.drones(): + for drone in pool.drones: result += 1 tmp = 0 for resource_key in pool.resources: diff --git a/drone.py b/drone.py index cb978ec..dcb3e39 100644 --- a/drone.py +++ b/drone.py @@ -1,6 +1,6 @@ from cobald import interfaces -from job import job +from job import Job class Drone(interfaces.Pool): @@ -48,17 +48,14 @@ def shutdown(self): # print("[drone %s] has been shut down" % self) def start_job(self, walltime, resources): - # print("[drone %s] starting job at %d" % (self, self.env.now)) for resource_key in resources: if self.resources[resource_key] + resources[resource_key]: # TODO: kill job pass for resource_key in resources: self.resources[resource_key] += resources[resource_key] - yield self.env.process(job(self.env, walltime, resources)) + yield from Job(self.env, walltime, resources) for resource_key in resources: self.resources[resource_key] -= resources[resource_key] # put drone back into pool queue # print("[drone %s] finished job at %d" % (self, self.env.now)) - self.pool.add_drone(self) - yield from self.pool.drone_ready(self) diff --git a/job.py b/job.py index 7054ad6..62439d9 100644 --- a/job.py +++ b/job.py @@ -32,11 +32,17 @@ def job_demand(env): # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) -def job(env, walltime, resources): - # print("starting job at", env.now) - globals.global_demand.get(1) - yield env.timeout(walltime) - # print("job finished", env.now) +class Job(object): + def __init__(self, env, walltime, resources): + self.env = env + self.resources = resources + self.walltime = walltime + + def __iter__(self): + # print("starting job at", self.env.now) + yield globals.global_demand.get(1) + yield self.env.timeout(self.walltime) + # print("job finished", self.env.now) def job_property_generator(): diff --git a/main.py b/main.py index e099bac..ea3b982 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ import globals from cost import cobald_cost from job import job_demand, job_property_generator -from scheduler import job_scheduler +from scheduler import job_scheduler, htcondor_job_scheduler from pool import Pool, pool_demands, pool_utilisation, pool_allocation, pool_unused from controller import SimulatedLinearController @@ -43,6 +43,9 @@ def monitor(data, t, prio, eid, event): globals.cost += current_cost data[tmp]["acc_cost"] = globals.cost last_step = 
tmp + # for pool in globals.pools: + # print("%s [Pool %s] drones %d, demand %d, supply %d (%d); allocation %.2f, utilisation %.2f" % ( + # tmp, pool, len(pool.drones), pool.demand, pool.supply, pool.level, pool.allocation, pool.utilisation)) def main(): @@ -58,7 +61,7 @@ def main(): SimulatedLinearController(env, target=pool, rate=1) globals.global_demand = simpy.Container(env) env.process(job_demand(env)) - env.process(job_scheduler(env)) + env.process(htcondor_job_scheduler(env)) env.run(until=1000) # Plotting some first results diff --git a/pool.py b/pool.py index 5ae6d3d..65052a7 100644 --- a/pool.py +++ b/pool.py @@ -15,7 +15,7 @@ def pool_demands(): def pool_utilisation(): result = [] for pool in globals.pools: - for drone in pool.drones(): + for drone in pool.drones: result.append(drone.utilisation) return sum(result) @@ -23,7 +23,7 @@ def pool_utilisation(): def pool_allocation(): result = [] for pool in globals.pools: - for drone in pool.drones(): + for drone in pool.drones: result.append(drone.allocation) return sum(result) @@ -31,7 +31,7 @@ def pool_allocation(): def pool_unused(): result = 0 for pool in globals.pools: - for drone in pool.drones(): + for drone in pool.drones: if drone.allocation == 0: result += 1 return result @@ -42,8 +42,7 @@ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8, " super(Pool, self).__init__(env, capacity, init) self.resources = resources self._demand = 0 - self._drones = [] - self._drones_in_use = [] + self.drones = [] self.env = env self.action = env.process(self.run()) @@ -51,21 +50,21 @@ def run(self): while True: if self.drone_demand() < self._demand: # start a new drone - self.add_drone(Drone(self.env, self, 10)) + Drone(self.env, self, 10) elif self.drone_demand() > self._demand: yield self.get(1) - drone = self._drones.pop(0) + drone = self.drones.pop(0) yield from drone.shutdown() del drone yield self.env.timeout(1) def drone_demand(self): - return len(self._drones) + len(self._drones_in_use) + return len(self.drones) @property def allocation(self) -> float: allocations = [] - for drone in self.drones(): + for drone in self.drones: allocations.append(drone.allocation) try: return sum(allocations) / len(allocations) @@ -75,7 +74,7 @@ def allocation(self) -> float: @property def utilisation(self) -> float: utilisations = [] - for drone in self.drones(): + for drone in self.drones: utilisations.append(drone.utilisation) try: return sum(utilisations) / len(utilisations) @@ -85,7 +84,7 @@ def utilisation(self) -> float: @property def supply(self): supply = 0 - for drone in self.drones(): + for drone in self.drones: supply += drone.supply return supply @@ -100,25 +99,7 @@ def demand(self, value): else: self._demand = 0 - def add_drone(self, drone): - try: - self._drones_in_use.remove(drone) - except ValueError: - pass - def drone_ready(self, drone): # print("[drone %s] is ready at %d" % (drone, self.env.now)) - self._drones.append(drone) + self.drones.append(drone) yield self.put(1) - - def get_drone(self, amount): - yield self.get(amount) - drone = self._drones.pop(0) - self._drones_in_use.append(drone) - return drone - - def drones(self): - for drone in self._drones: - yield drone - for drone in self._drones_in_use: - yield drone diff --git a/scheduler.py b/scheduler.py index 30d574c..15e626d 100644 --- a/scheduler.py +++ b/scheduler.py @@ -9,3 +9,65 @@ def job_scheduler(env): env.process(drone.start_job(*next(globals.job_generator))) yield env.timeout(0) yield env.timeout(1) + + +def htcondor_job_scheduler(env): 
+ """ + Goal of the htcondor job scheduler is to have a scheduler that somehow mimics how htcondor does schedule jobs. + Htcondor does scheduling based on a priority queue. The priorities itself are managed by operators of htcondor. + So different instances can apparently behave very different. + + In my case I am going to try building a priority queue that sorts job slots by increasing cost. The cost itself + is calculated based on the current strategy that is used at GridKa. The scheduler checks if a job either + exactly fits a slot or if it does fit into it several times. The cost for putting a job at a given slot is + given by the amount of resources that might remain unallocated. + :param env: + :return: + """ + def schedule_pool(job): + priorities = {} + for pool in globals.pools: + for drone in pool.drones: + cost = 0 + resource_types = {*drone.resources.keys(), *job[1].keys()} + for resource_type in resource_types: + if resource_type not in drone.resources.keys(): + cost = float("Inf") + elif (pool.resources[resource_type] - drone.resources[resource_type]) < \ + job[1][resource_type]: + cost = float("Inf") + else: + cost += (pool.resources[resource_type] - drone.resources[resource_type]) // \ + job[1][resource_type] + cost /= len(resource_types) + if cost <= 1: + # directly start stuff + return drone + try: + priorities[cost].append(drone) + except KeyError: + priorities[cost] = [drone] + try: + minimal_key = min(priorities) + if minimal_key < float("Inf"): + return priorities[minimal_key][0] + except ValueError: + pass + return None + + current_job = None + while True: + if not current_job: + current_job = next(globals.job_generator) + if globals.global_demand.level > 0: + best_match = schedule_pool(current_job) + if best_match: + env.process(best_match.start_job(*current_job)) + current_job = None + yield env.timeout(0) + else: + yield env.timeout(1) + else: + yield env.timeout(1) + + From afa7b26de1be65d147e081988a2e81d2e44d63de Mon Sep 17 00:00:00 2001 From: "matthias.schnepf" Date: Mon, 20 Aug 2018 16:28:01 +0200 Subject: [PATCH 012/648] update controller due to cobald commit https://github.com/MaineKuehn/cobald/commit/7108f7bff9fa5d6c9e5d566f0e44f88a2197cb43#diff-ea5ffbd14c2ae4836b4ca0400af9ecee --- controller.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/controller.py b/controller.py index 51e09b4..3bde38c 100644 --- a/controller.py +++ b/controller.py @@ -3,14 +3,15 @@ class SimulatedLinearController(LinearController): - def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1): - super(SimulatedLinearController, self).__init__(target, low_utilisation, high_allocation, rate) + def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): + super(SimulatedLinearController, self).__init__(target, low_utilisation, high_allocation, rate, interval) self.env = env self.action = env.process(self.run()) + self._interval = interval def run(self): while True: - self.regulate_demand() + self.regulate(interval=self._interval) # print("[controller] demand %d -> %d, supply %d (global %d), allocation %.2f, utilisation %.2f " # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, # self.target.allocation, self.target.utilisation, self.target.level)) From aa0331527bce137a84d28e7ebe20a16c38744abf Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 20:28:22 +0200 Subject: [PATCH 013/648] added generator for job properties from 
htcondor export --- job.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/job.py b/job.py index 62439d9..a97542b 100644 --- a/job.py +++ b/job.py @@ -1,5 +1,7 @@ import random import math +import csv + import globals @@ -48,3 +50,15 @@ def __iter__(self): def job_property_generator(): while True: yield 10, {"memory": 8, "cores": 1, "disk": 100} + + +def htcondor_export_job_generator(filename): + with open(filename, "r") as input_file: + htcondor_reader = csv.reader(input_file, delimiter=' ', quotechar="'") + header = next(htcondor_reader) + for row in htcondor_reader: + yield 10, { + "cores": int(row[header.index("RequestCpus")]), + "disk": int(row[header.index("RequestDisk")]), + "memory": float(row[header.index("RequestMemory")]) + } From 02dacdfd6a2fecf81366bffbe0f7f822e9abc990 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 21:45:18 +0200 Subject: [PATCH 014/648] information about used resources can now be specified for jobs --- drone.py | 2 +- job.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drone.py b/drone.py index dcb3e39..25ba3a0 100644 --- a/drone.py +++ b/drone.py @@ -47,7 +47,7 @@ def shutdown(self): yield self.env.timeout(1) # print("[drone %s] has been shut down" % self) - def start_job(self, walltime, resources): + def start_job(self, walltime, resources, used_resources=None): for resource_key in resources: if self.resources[resource_key] + resources[resource_key]: # TODO: kill job diff --git a/job.py b/job.py index a97542b..53d1b51 100644 --- a/job.py +++ b/job.py @@ -35,7 +35,7 @@ def job_demand(env): class Job(object): - def __init__(self, env, walltime, resources): + def __init__(self, env, walltime, resources, used_resources=None): self.env = env self.resources = resources self.walltime = walltime @@ -61,4 +61,7 @@ def htcondor_export_job_generator(filename): "cores": int(row[header.index("RequestCpus")]), "disk": int(row[header.index("RequestDisk")]), "memory": float(row[header.index("RequestMemory")]) + }, { + "memory": float(row[header.index("MemoryUsage")]), + "disk": int(row[header.index("DiskUsage_RAW")]) } From 9273b68304e0c54cd9bf4a3a96c0e7e96e5ca76f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 22:31:32 +0200 Subject: [PATCH 015/648] added pool supply to monitoring --- main.py | 38 ++++++++++++++++++++++++++------------ pool.py | 7 +++++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index ea3b982..f2cb23c 100644 --- a/main.py +++ b/main.py @@ -8,7 +8,7 @@ from cost import cobald_cost from job import job_demand, job_property_generator from scheduler import job_scheduler, htcondor_job_scheduler -from pool import Pool, pool_demands, pool_utilisation, pool_allocation, pool_unused +from pool import Pool, pool_demands, pool_supplys, pool_utilisation, pool_allocation, pool_unused from controller import SimulatedLinearController @@ -35,6 +35,7 @@ def monitor(data, t, prio, eid, event): tmp = round(t) data[tmp]["user_demand"] = globals.global_demand.level data[tmp]["pool_demand"] = pool_demands() + data[tmp]["pool_supply"] = pool_supplys() data[tmp]["pool_utilisation"] = pool_utilisation() data[tmp]["pool_allocation"] = pool_allocation() data[tmp]["pool_unused"] = pool_unused() * -1 @@ -54,30 +55,43 @@ def main(): random.seed(1234) env = simpy.Environment() trace(env, monitor_data) - globals.job_generator = job_property_generator() - for i in range(10): - pool = Pool(env) + #globals.job_generator = job_property_generator() + 
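htcondor_export_job_generator() expects a space-delimited export with a header row naming the classad columns it reads (at this point RequestCpus, RequestDisk, RequestMemory, MemoryUsage, DiskUsage_RAW), with single quotes as the quote character. A hypothetical file and the tuple it would yield, purely for illustration — the values are invented:

# condor_usage.csv (hypothetical content):
#   RequestCpus RequestDisk RequestMemory MemoryUsage DiskUsage_RAW
#   1 2000000 2048 1740 1500000
#
# With such a row, next(htcondor_export_job_generator("condor_usage.csv"))
# returns (10, {"cores": 1, "disk": 2000000, "memory": 2048.0},
#              {"memory": 1740.0, "disk": 1500000})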
globals.job_generator = htcondor_export_job_generator("condor_usage.csv") + for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: + pool = Pool(env, resources=resources) globals.pools.append(pool) - SimulatedLinearController(env, target=pool, rate=1) + SimulatedCostController(env, target=pool, rate=1) globals.global_demand = simpy.Container(env) env.process(job_demand(env)) env.process(htcondor_job_scheduler(env)) env.run(until=1000) # Plotting some first results - plt.plot(globals.monitoring_data.keys(), [value.get("user_demand", None) for value in globals.monitoring_data.values()]) plt.plot(globals.monitoring_data.keys(), - [value.get("user_demand_new", None) for value in globals.monitoring_data.values()], - 'ro') + [value.get("user_demand", None) for value in globals.monitoring_data.values()], + label="Accumulated demand") plt.plot(globals.monitoring_data.keys(), - [value.get("pool_demand", None) for value in globals.monitoring_data.values()]) + [value.get("user_demand_new", None) for value in globals.monitoring_data.values()], + 'ro', + label="Current demand") + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_demand", None) for value in globals.monitoring_data.values()], + label="Pool demand") + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_supply", None) for value in globals.monitoring_data.values()], + label="Pool supply") + plt.legend() plt.show() plt.plot(globals.monitoring_data.keys(), - [value.get("pool_utilisation", None) for value in globals.monitoring_data.values()]) + [value.get("pool_utilisation", None) for value in globals.monitoring_data.values()], + label="Pool utilisation") plt.plot(globals.monitoring_data.keys(), - [value.get("pool_allocation", None) for value in globals.monitoring_data.values()]) + [value.get("pool_allocation", None) for value in globals.monitoring_data.values()], + label="Pool allocation") plt.plot(globals.monitoring_data.keys(), - [value.get("pool_unused", None) for value in globals.monitoring_data.values()]) + [value.get("pool_unused", None) for value in globals.monitoring_data.values()], + label="Wasted resources") + plt.legend() plt.show() fig, ax1 = plt.subplots() diff --git a/pool.py b/pool.py index 65052a7..92b50ec 100644 --- a/pool.py +++ b/pool.py @@ -12,6 +12,13 @@ def pool_demands(): return result +def pool_supplys(): + result = 0 + for pool in globals.pools: + result += pool.supply + return result + + def pool_utilisation(): result = [] for pool in globals.pools: From 77233559a27b2a94e84a702a97e7e7031bc24ba3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 23:15:49 +0200 Subject: [PATCH 016/648] added plot to show different utilisation of pools --- main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index f2cb23c..ab767b2 100644 --- a/main.py +++ b/main.py @@ -44,7 +44,8 @@ def monitor(data, t, prio, eid, event): globals.cost += current_cost data[tmp]["acc_cost"] = globals.cost last_step = tmp - # for pool in globals.pools: + for pool in globals.pools: + data[tmp]["pool_%s_supply" % pool] = pool.supply # print("%s [Pool %s] drones %d, demand %d, supply %d (%d); allocation %.2f, utilisation %.2f" % ( # tmp, pool, len(pool.drones), pool.demand, pool.supply, pool.level, pool.allocation, pool.utilisation)) @@ -94,6 +95,13 @@ def main(): plt.legend() plt.show() + for index, pool in enumerate(globals.pools): + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_%s_supply" % pool, None) for 
value in globals.monitoring_data.values()], + label="Pool %d supply" % index) + plt.legend() + plt.show() + fig, ax1 = plt.subplots() ax1.plot(globals.monitoring_data.keys(), [value.get("cost", None) for value in globals.monitoring_data.values()], 'b-') From 2fc1d950c778fb142372df8405b6ff602256433f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 23:34:10 +0200 Subject: [PATCH 017/648] interval does not need to be stored explicitely --- controller.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/controller.py b/controller.py index 3bde38c..769e8aa 100644 --- a/controller.py +++ b/controller.py @@ -7,12 +7,11 @@ def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, super(SimulatedLinearController, self).__init__(target, low_utilisation, high_allocation, rate, interval) self.env = env self.action = env.process(self.run()) - self._interval = interval def run(self): while True: - self.regulate(interval=self._interval) + self.regulate(interval=self.interval) # print("[controller] demand %d -> %d, supply %d (global %d), allocation %.2f, utilisation %.2f " # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, # self.target.allocation, self.target.utilisation, self.target.level)) - yield self.env.timeout(self._interval) + yield self.env.timeout(self.interval) From 684dc1c818e62ab53e6326881d5059e85edf85d1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 23:42:37 +0200 Subject: [PATCH 018/648] now also walltime goes into simulation --- job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/job.py b/job.py index 53d1b51..cb56bbc 100644 --- a/job.py +++ b/job.py @@ -57,7 +57,7 @@ def htcondor_export_job_generator(filename): htcondor_reader = csv.reader(input_file, delimiter=' ', quotechar="'") header = next(htcondor_reader) for row in htcondor_reader: - yield 10, { + yield float(row[header.index("RemoteWallClockTime")]), { "cores": int(row[header.index("RequestCpus")]), "disk": int(row[header.index("RequestDisk")]), "memory": float(row[header.index("RequestMemory")]) From 6515d8461d7e4b897b3cb5dfd62ee6e61f8bd7ad Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 23:52:15 +0200 Subject: [PATCH 019/648] added information on number of running jobs to drone --- drone.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drone.py b/drone.py index 25ba3a0..7a3cd0f 100644 --- a/drone.py +++ b/drone.py @@ -11,6 +11,7 @@ def __init__(self, env, pool, scheduling_duration): self.action = env.process(self.run(scheduling_duration)) self.resources = {resource: 0 for resource in self.pool.resources} self._supply = 0 + self.jobs = 0 def run(self, scheduling_duration): yield self.env.timeout(scheduling_duration) @@ -54,7 +55,9 @@ def start_job(self, walltime, resources, used_resources=None): pass for resource_key in resources: self.resources[resource_key] += resources[resource_key] + self.jobs += 1 yield from Job(self.env, walltime, resources) + self.jobs -= 1 for resource_key in resources: self.resources[resource_key] -= resources[resource_key] # put drone back into pool queue From bae790ead3bf4137f57d0dc1d3466384b789a0bd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 20 Aug 2018 23:53:47 +0200 Subject: [PATCH 020/648] added number of running jobs to plot --- main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index ab767b2..d374278 100644 --- a/main.py +++ b/main.py @@ -8,7 +8,7 @@ from cost 
import cobald_cost from job import job_demand, job_property_generator from scheduler import job_scheduler, htcondor_job_scheduler -from pool import Pool, pool_demands, pool_supplys, pool_utilisation, pool_allocation, pool_unused +from pool import Pool, pool_demands, pool_supplys, pool_utilisation, pool_allocation, pool_unused, pool_jobs from controller import SimulatedLinearController @@ -39,6 +39,7 @@ def monitor(data, t, prio, eid, event): data[tmp]["pool_utilisation"] = pool_utilisation() data[tmp]["pool_allocation"] = pool_allocation() data[tmp]["pool_unused"] = pool_unused() * -1 + data[tmp]["pool_jobs"] = pool_jobs() current_cost = cobald_cost() data[tmp]["cost"] = current_cost globals.cost += current_cost @@ -81,6 +82,9 @@ def main(): plt.plot(globals.monitoring_data.keys(), [value.get("pool_supply", None) for value in globals.monitoring_data.values()], label="Pool supply") + plt.plot(globals.monitoring_data.keys(), + [value.get("pool_jobs", None) for value in globals.monitoring_data.values()], + label="Running jobs") plt.legend() plt.show() plt.plot(globals.monitoring_data.keys(), @@ -118,6 +122,7 @@ def main(): fig.tight_layout() plt.show() + print("final cost: %.2f" % globals.monitoring_data[sorted(globals.monitoring_data.keys())[-1]]["acc_cost"]) if __name__ == "__main__": From f4b1fd34c35f94b6643abf5c1df9f1f8ba82a7b8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 05:40:16 +0200 Subject: [PATCH 021/648] made monitoring more efficient and added resource plot ala roced --- main.py | 75 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index d374278..7320f14 100644 --- a/main.py +++ b/main.py @@ -8,17 +8,17 @@ from cost import cobald_cost from job import job_demand, job_property_generator from scheduler import job_scheduler, htcondor_job_scheduler -from pool import Pool, pool_demands, pool_supplys, pool_utilisation, pool_allocation, pool_unused, pool_jobs +from pool import Pool from controller import SimulatedLinearController -def trace(env, callback): +def trace(env, callback, resource_normalisation): def get_wrapper(env_step, callback): @wraps(env_step) def tracing_step(): if len(env._queue): t, prio, eid, event = env._queue[0] - callback(t, prio, eid, event) + callback(t, prio, eid, event, resource_normalisation) return env_step() return tracing_step env.step = get_wrapper(env.step, callback) @@ -27,26 +27,49 @@ def tracing_step(): last_step = 0 -def monitor(data, t, prio, eid, event): +def monitor(data, t, prio, eid, event, resource_normalisation): global last_step - # TODO: do relevant monitoring if t > last_step: # new data to be recorded tmp = round(t) - data[tmp]["user_demand"] = globals.global_demand.level - data[tmp]["pool_demand"] = pool_demands() - data[tmp]["pool_supply"] = pool_supplys() - data[tmp]["pool_utilisation"] = pool_utilisation() - data[tmp]["pool_allocation"] = pool_allocation() - data[tmp]["pool_unused"] = pool_unused() * -1 - data[tmp]["pool_jobs"] = pool_jobs() - current_cost = cobald_cost() - data[tmp]["cost"] = current_cost - globals.cost += current_cost - data[tmp]["acc_cost"] = globals.cost last_step = tmp + pool_demand = 0 + pool_supply = 0 + pool_utilisation = 0 + pool_allocation = 0 + running_jobs = 0 + used_resources = 0 + unused_resources = 0 + empty_drones = 0 + result = {} for pool in globals.pools: - data[tmp]["pool_%s_supply" % pool] = pool.supply + pool_demand += pool.demand + pool_supply += pool.supply + result["pool_%s_supply" % 
pool] = pool.supply + pool_utilisation += pool.utilisation + pool_allocation += pool.allocation + for drone in pool.drones: + running_jobs += drone.jobs + if drone.allocation == 0: + empty_drones += 1 + for resource_key, usage in drone.resources.items(): + normalisation_factor = resource_normalisation.get(resource_key, 1) + used_resources += usage / normalisation_factor + unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor + result["user_demand"] = globals.global_demand.level + result["pool_demand"] = pool_demand + result["pool_supply"] = pool_supply + result["pool_utilisation"] = pool_utilisation + result["pool_allocation"] = pool_allocation + result["running_jobs"] = running_jobs + result["empty_drones"] = empty_drones + result["used_resources"] = used_resources + result["unused_resources"] = unused_resources + cost = cobald_cost() + result["cost"] = cost + globals.cost += cost + result["acc_cost"] = globals.cost + globals.monitoring_data[tmp].update(result) # print("%s [Pool %s] drones %d, demand %d, supply %d (%d); allocation %.2f, utilisation %.2f" % ( # tmp, pool, len(pool.drones), pool.demand, pool.supply, pool.level, pool.allocation, pool.utilisation)) @@ -56,7 +79,7 @@ def main(): random.seed(1234) env = simpy.Environment() - trace(env, monitor_data) + trace(env, monitor_data, resource_normalisation={"memory": 2000}) #globals.job_generator = job_property_generator() globals.job_generator = htcondor_export_job_generator("condor_usage.csv") for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: @@ -83,7 +106,7 @@ def main(): [value.get("pool_supply", None) for value in globals.monitoring_data.values()], label="Pool supply") plt.plot(globals.monitoring_data.keys(), - [value.get("pool_jobs", None) for value in globals.monitoring_data.values()], + [value.get("running_jobs", None) for value in globals.monitoring_data.values()], label="Running jobs") plt.legend() plt.show() @@ -94,8 +117,8 @@ def main(): [value.get("pool_allocation", None) for value in globals.monitoring_data.values()], label="Pool allocation") plt.plot(globals.monitoring_data.keys(), - [value.get("pool_unused", None) for value in globals.monitoring_data.values()], - label="Wasted resources") + [value.get("empty_drones", None) for value in globals.monitoring_data.values()], + label="Unallocated drones") plt.legend() plt.show() @@ -122,6 +145,16 @@ def main(): fig.tight_layout() plt.show() + + # resource plot for max + plt.plot(globals.monitoring_data.keys(), + [value.get("unused_resources", None) for value in globals.monitoring_data.values()], + label="Unused") + plt.plot(globals.monitoring_data.keys(), + [value.get("used_resources", None) for value in globals.monitoring_data.values()], + label="used") + plt.legend() + plt.show() print("final cost: %.2f" % globals.monitoring_data[sorted(globals.monitoring_data.keys())[-1]]["acc_cost"]) From 6eb292a0780c35a657f6f40e02193e8cd61bee69 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 06:05:17 +0200 Subject: [PATCH 022/648] plot now also contains ratio of available and unused resources --- main.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 7320f14..aab9c83 100644 --- a/main.py +++ b/main.py @@ -40,6 +40,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): running_jobs = 0 used_resources = 0 unused_resources = 0 + available_resources = 0 empty_drones = 0 result = {} for pool in 
globals.pools: @@ -56,6 +57,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): normalisation_factor = resource_normalisation.get(resource_key, 1) used_resources += usage / normalisation_factor unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor + available_resources += pool.resources[resource_key] / normalisation_factor result["user_demand"] = globals.global_demand.level result["pool_demand"] = pool_demand result["pool_supply"] = pool_supply @@ -65,6 +67,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): result["empty_drones"] = empty_drones result["used_resources"] = used_resources result["unused_resources"] = unused_resources + result["available_resources"] = available_resources cost = cobald_cost() result["cost"] = cost globals.cost += cost @@ -147,14 +150,27 @@ def main(): plt.show() # resource plot for max - plt.plot(globals.monitoring_data.keys(), + fig, ax = plt.subplots(2, sharex=True) + ax[0].plot(globals.monitoring_data.keys(), [value.get("unused_resources", None) for value in globals.monitoring_data.values()], label="Unused") - plt.plot(globals.monitoring_data.keys(), + ax[0].plot(globals.monitoring_data.keys(), [value.get("used_resources", None) for value in globals.monitoring_data.values()], - label="used") - plt.legend() - plt.show() + label="Used") + ax[0].set_title("Resource utilisation") + ax[0].legend() + percentages = [] + percentage_means = [] + for value in globals.monitoring_data.values(): + try: + percentages.append(value.get("unused_resources", None) / value.get("available_resources", None)) + except ZeroDivisionError: + percentages.append(1) + percentage_means.append(sum(percentages) / len(percentages)) + ax[1].plot(globals.monitoring_data.keys(), percentages) + ax[1].plot(globals.monitoring_data.keys(), percentage_means, label="mean") + ax[1].set_title("Percentage of unused resources") + fig.show() print("final cost: %.2f" % globals.monitoring_data[sorted(globals.monitoring_data.keys())[-1]]["acc_cost"]) From 9252a745e880df46b3aeb27b88ba413f1decbab4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 10:36:43 +0200 Subject: [PATCH 023/648] utilisation and allocation for now defined based on min and max --- drone.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drone.py b/drone.py index 7a3cd0f..dc49407 100644 --- a/drone.py +++ b/drone.py @@ -32,16 +32,17 @@ def demand(self, value): @property def utilisation(self): - result = 0 - for resource in self.resources: - result += self.resources[resource] / self.pool.resources[resource] - return result / len(self.resources) - #return min((self._memory / self.pool.memory), (self._disk / self.pool.disk), (self._cores / self.pool.cores)) + resources = [] + for resource_key, value in self.resources.items(): + resources.append(value / self.pool.resources[resource_key]) + return min(resources) @property def allocation(self): - return sum(self.resources.values()) > 0 - #return max((self._memory / self.pool.memory), (self._disk / self.pool.disk), (self._cores / self.pool.cores)) + resources = [] + for resource_key, value in self.resources.items(): + resources.append(value / self.pool.resources[resource_key]) + return max(resources) def shutdown(self): self._supply = 0 From c8aa4ea54050b423dbe7d21613308b2ff8f48e74 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 10:42:26 +0200 Subject: [PATCH 024/648] condor scheduler now uses realistic timings --- scheduler.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/scheduler.py b/scheduler.py index 15e626d..7a011cb 100644 --- a/scheduler.py +++ b/scheduler.py @@ -66,8 +66,8 @@ def schedule_pool(job): current_job = None yield env.timeout(0) else: - yield env.timeout(1) + yield env.timeout(60) else: - yield env.timeout(1) + yield env.timeout(60) From 05ace0f1318e305ea0ff6fb66991af8b744c08f9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 11:24:36 +0200 Subject: [PATCH 025/648] improved condor scheduling --- scheduler.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/scheduler.py b/scheduler.py index 7a011cb..5f3316a 100644 --- a/scheduler.py +++ b/scheduler.py @@ -36,12 +36,13 @@ def schedule_pool(job): elif (pool.resources[resource_type] - drone.resources[resource_type]) < \ job[1][resource_type]: cost = float("Inf") + break else: cost += (pool.resources[resource_type] - drone.resources[resource_type]) // \ job[1][resource_type] cost /= len(resource_types) if cost <= 1: - # directly start stuff + # directly start job return drone try: priorities[cost].append(drone) @@ -55,19 +56,32 @@ def schedule_pool(job): pass return None + unscheduled_jobs = [] current_job = None + postponed_unmatched_job = False while True: - if not current_job: + if not postponed_unmatched_job and len(unscheduled_jobs) > 0: + for job in unscheduled_jobs: + best_match = schedule_pool(job) + if best_match: + env.process(best_match.start_job(*job)) + unscheduled_jobs.remove(job) + yield env.timeout(0) + if not current_job and globals.global_demand.level - len(unscheduled_jobs) > 0: current_job = next(globals.job_generator) - if globals.global_demand.level > 0: + if current_job: best_match = schedule_pool(current_job) if best_match: env.process(best_match.start_job(*current_job)) current_job = None yield env.timeout(0) else: - yield env.timeout(60) + postponed_unmatched_job = True + unscheduled_jobs.append(current_job) + current_job = None + yield env.timeout(0) else: + postponed_unmatched_job = False yield env.timeout(60) From f2886297f5421449e9b2a28eaa3ef4cfaa0e8214 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 18:49:21 +0200 Subject: [PATCH 026/648] adapted pool to only remove drones that do not run jobs and allow to start several drones in a time step --- pool.py | 57 ++++++++++++--------------------------------------------- 1 file changed, 12 insertions(+), 45 deletions(-) diff --git a/pool.py b/pool.py index 92b50ec..2367964 100644 --- a/pool.py +++ b/pool.py @@ -1,51 +1,11 @@ from simpy.resources import container from cobald import interfaces -import globals from drone import Drone -def pool_demands(): - result = 0 - for pool in globals.pools: - result += pool.demand - return result - - -def pool_supplys(): - result = 0 - for pool in globals.pools: - result += pool.supply - return result - - -def pool_utilisation(): - result = [] - for pool in globals.pools: - for drone in pool.drones: - result.append(drone.utilisation) - return sum(result) - - -def pool_allocation(): - result = [] - for pool in globals.pools: - for drone in pool.drones: - result.append(drone.allocation) - return sum(result) - - -def pool_unused(): - result = 0 - for pool in globals.pools: - for drone in pool.drones: - if drone.allocation == 0: - result += 1 - return result - - class Pool(interfaces.Pool, container.Container): - def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8, "cores": 1, "disk": 100}): + def __init__(self, env, capacity=float('inf'), 
init=0, resources={"memory": 8, "cores": 1}): super(Pool, self).__init__(env, capacity, init) self.resources = resources self._demand = 0 @@ -55,12 +15,20 @@ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8, " def run(self): while True: - if self.drone_demand() < self._demand: + drones_required = self._demand - self.level + while drones_required > 0: + drones_required -= 1 # start a new drone Drone(self.env, self, 10) - elif self.drone_demand() > self._demand: + yield self.put(1) + if self.level > self._demand: + for drone in self.drones: + if drone.jobs == 0: + break + else: + break yield self.get(1) - drone = self.drones.pop(0) + self.drones.remove(drone) yield from drone.shutdown() del drone yield self.env.timeout(1) @@ -109,4 +77,3 @@ def demand(self, value): def drone_ready(self, drone): # print("[drone %s] is ready at %d" % (drone, self.env.now)) self.drones.append(drone) - yield self.put(1) From 31fa3aef61aeeea3e0f0056475cef5a01c02b642 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 18:49:46 +0200 Subject: [PATCH 027/648] added an experimental controller --- controller.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/controller.py b/controller.py index 769e8aa..a1cc79c 100644 --- a/controller.py +++ b/controller.py @@ -15,3 +15,24 @@ def run(self): # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, # self.target.allocation, self.target.utilisation, self.target.level)) yield self.env.timeout(self.interval) + + +class SimulatedCostController(SimulatedLinearController): + def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): + self.current_cost = 1 + super(SimulatedCostController, self).__init__(env, target, low_utilisation, high_allocation, rate, interval) + + def regulate(self, interval): + allocation = 0 + for drone in self.target.drones: + allocation += drone.allocation + if self.target.supply - allocation <= 1: + if self.target.utilisation >= .8: + self.target.demand = int(allocation + self.current_cost) + self.current_cost += 1 + else: + self.target.demand = allocation + if self.current_cost > 1: + self.current_cost -= 1 + # else: + # self.target.demand = allocation From 008d585aa1a106054444771444fe4163a03fa64d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 18:50:11 +0200 Subject: [PATCH 028/648] drone does not need to yield from drone ready anymore --- drone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drone.py b/drone.py index dc49407..6e880c4 100644 --- a/drone.py +++ b/drone.py @@ -16,7 +16,7 @@ def __init__(self, env, pool, scheduling_duration): def run(self, scheduling_duration): yield self.env.timeout(scheduling_duration) self._supply = 1 - yield from self.pool.drone_ready(self) + self.pool.drone_ready(self) @property def supply(self): From c847027f60a1c311ebc0237e7ebe54052d0b3ddd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 18:50:29 +0200 Subject: [PATCH 029/648] removed disk from job --- job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/job.py b/job.py index cb56bbc..98b8552 100644 --- a/job.py +++ b/job.py @@ -59,7 +59,7 @@ def htcondor_export_job_generator(filename): for row in htcondor_reader: yield float(row[header.index("RemoteWallClockTime")]), { "cores": int(row[header.index("RequestCpus")]), - "disk": int(row[header.index("RequestDisk")]), + #"disk": int(row[header.index("RequestDisk")]), "memory": 
float(row[header.index("RequestMemory")]) }, { "memory": float(row[header.index("MemoryUsage")]), From ee8b1022241a1269ac89fc16db1110517605b1d2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 18:51:01 +0200 Subject: [PATCH 030/648] job property generator currently does not work anymore, so added todo --- scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scheduler.py b/scheduler.py index 5f3316a..8a6dfb0 100644 --- a/scheduler.py +++ b/scheduler.py @@ -1,6 +1,7 @@ import globals +# TODO: does not work anymore as there is no method get_drone at pool def job_scheduler(env): while True: for pool in globals.pools: From 30cfeddb83fcfc65b1a5a079c336af2c31daefc9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 22 Aug 2018 18:51:17 +0200 Subject: [PATCH 031/648] imports for new controller --- main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index aab9c83..d38a82d 100644 --- a/main.py +++ b/main.py @@ -6,10 +6,10 @@ import globals from cost import cobald_cost -from job import job_demand, job_property_generator -from scheduler import job_scheduler, htcondor_job_scheduler +from job import job_demand, job_property_generator, htcondor_export_job_generator +from scheduler import htcondor_job_scheduler from pool import Pool -from controller import SimulatedLinearController +from controller import SimulatedLinearController, SimulatedCostController def trace(env, callback, resource_normalisation): From 814738f3ef4e24391e798616132ef4aab901b527 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 12:46:27 +0200 Subject: [PATCH 032/648] implementation to make demand and scheduler more explicit to prepare for static resources --- globals.py | 3 ++- job.py | 46 ++++++++++++++++++++++---------- main.py | 61 +++++++++++++++++++++++-------------------- scheduler.py | 74 +++++++++++++++++++++++++++++----------------------- 4 files changed, 109 insertions(+), 75 deletions(-) diff --git a/globals.py b/globals.py index 572daba..78f6476 100644 --- a/globals.py +++ b/globals.py @@ -1,7 +1,8 @@ from collections import defaultdict pools = [] -global_demand = None +job_queue = [] job_generator = None +job_scheduler = None monitoring_data = defaultdict(dict) # {tme: {variable: value, ...}} cost = 0 diff --git a/job.py b/job.py index 98b8552..6d25c1c 100644 --- a/job.py +++ b/job.py @@ -41,27 +41,45 @@ def __init__(self, env, walltime, resources, used_resources=None): self.walltime = walltime def __iter__(self): - # print("starting job at", self.env.now) - yield globals.global_demand.get(1) + # print(self, "starting job at", self.env.now, "with duration", self.walltime) yield self.env.timeout(self.walltime) - # print("job finished", self.env.now) + # print(self, "job finished", self.env.now) -def job_property_generator(): +def job_property_generator(**kwargs): while True: yield 10, {"memory": 8, "cores": 1, "disk": 100} -def htcondor_export_job_generator(filename): +def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): with open(filename, "r") as input_file: htcondor_reader = csv.reader(input_file, delimiter=' ', quotechar="'") header = next(htcondor_reader) - for row in htcondor_reader: - yield float(row[header.index("RemoteWallClockTime")]), { - "cores": int(row[header.index("RequestCpus")]), - #"disk": int(row[header.index("RequestDisk")]), - "memory": float(row[header.index("RequestMemory")]) - }, { - "memory": float(row[header.index("MemoryUsage")]), - "disk": int(row[header.index("DiskUsage_RAW")]) 
- } + row = next(htcondor_reader) + base_date = float(row[header.index("QDate")]) + current_time = 0 + + count = 0 + while True: + if not row: + row = next(htcondor_reader) + current_time = float(row[header.index("QDate")]) - base_date + if env.now >= current_time: + count += 1 + job_queue.append(( + float(row[header.index("RemoteWallClockTime")]), + { + "cores": int(row[header.index("RequestCpus")]), + # "disk": int(row[header.index("RequestDisk")]), + "memory": float(row[header.index("RequestMemory")]) + }, + { + "memory": float(row[header.index("MemoryUsage")]), + "disk": int(row[header.index("DiskUsage_RAW")]) + })) + row = None + else: + if count > 0: + globals.monitoring_data[round(env.now)]["user_demand_new"] = count + count = 0 + yield env.timeout(1) diff --git a/main.py b/main.py index d38a82d..9e9e6ef 100644 --- a/main.py +++ b/main.py @@ -6,10 +6,10 @@ import globals from cost import cobald_cost -from job import job_demand, job_property_generator, htcondor_export_job_generator -from scheduler import htcondor_job_scheduler +from job import job_demand, htcondor_export_job_generator +from scheduler import CondorJobScheduler from pool import Pool -from controller import SimulatedLinearController, SimulatedCostController +from controller import SimulatedCostController def trace(env, callback, resource_normalisation): @@ -58,7 +58,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): used_resources += usage / normalisation_factor unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor available_resources += pool.resources[resource_key] / normalisation_factor - result["user_demand"] = globals.global_demand.level + result["user_demand"] = len(globals.job_queue) result["pool_demand"] = pool_demand result["pool_supply"] = pool_supply result["pool_utilisation"] = pool_utilisation @@ -77,23 +77,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): # tmp, pool, len(pool.drones), pool.demand, pool.supply, pool.level, pool.allocation, pool.utilisation)) -def main(): - monitor_data = partial(monitor, globals.monitoring_data) - - random.seed(1234) - env = simpy.Environment() - trace(env, monitor_data, resource_normalisation={"memory": 2000}) - #globals.job_generator = job_property_generator() - globals.job_generator = htcondor_export_job_generator("condor_usage.csv") - for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: - pool = Pool(env, resources=resources) - globals.pools.append(pool) - SimulatedCostController(env, target=pool, rate=1) - globals.global_demand = simpy.Container(env) - env.process(job_demand(env)) - env.process(htcondor_job_scheduler(env)) - env.run(until=1000) - +def generate_plots(): # Plotting some first results plt.plot(globals.monitoring_data.keys(), [value.get("user_demand", None) for value in globals.monitoring_data.values()], @@ -126,6 +110,7 @@ def main(): plt.show() for index, pool in enumerate(globals.pools): + print("pool", index, "has", pool.resources) plt.plot(globals.monitoring_data.keys(), [value.get("pool_%s_supply" % pool, None) for value in globals.monitoring_data.values()], label="Pool %d supply" % index) @@ -134,7 +119,7 @@ def main(): fig, ax1 = plt.subplots() ax1.plot(globals.monitoring_data.keys(), - [value.get("cost", None) for value in globals.monitoring_data.values()], 'b-') + [value.get("cost", None) for value in globals.monitoring_data.values()], 'b-') ax1.set_xlabel('Time') # Make the y-axis label, ticks and tick labels match the line 
color. ax1.set_ylabel('Cost', color='b') @@ -142,7 +127,7 @@ def main(): ax2 = ax1.twinx() ax2.plot(globals.monitoring_data.keys(), - [value.get("acc_cost", None) for value in globals.monitoring_data.values()], 'r.') + [value.get("acc_cost", None) for value in globals.monitoring_data.values()], 'r.') ax2.set_ylabel('Accumulated Cost', color='r') ax2.tick_params('y', colors='r') @@ -152,18 +137,18 @@ def main(): # resource plot for max fig, ax = plt.subplots(2, sharex=True) ax[0].plot(globals.monitoring_data.keys(), - [value.get("unused_resources", None) for value in globals.monitoring_data.values()], - label="Unused") + [value.get("unused_resources", None) for value in globals.monitoring_data.values()], + label="Unused") ax[0].plot(globals.monitoring_data.keys(), - [value.get("used_resources", None) for value in globals.monitoring_data.values()], - label="Used") + [value.get("used_resources", None) for value in globals.monitoring_data.values()], + label="Used") ax[0].set_title("Resource utilisation") ax[0].legend() percentages = [] percentage_means = [] for value in globals.monitoring_data.values(): try: - percentages.append(value.get("unused_resources", None) / value.get("available_resources", None)) + percentages.append(value.get("unused_resources", 0) / value.get("available_resources", 0)) except ZeroDivisionError: percentages.append(1) percentage_means.append(sum(percentages) / len(percentages)) @@ -171,6 +156,26 @@ def main(): ax[1].plot(globals.monitoring_data.keys(), percentage_means, label="mean") ax[1].set_title("Percentage of unused resources") fig.show() + + +def main(): + monitor_data = partial(monitor, globals.monitoring_data) + + random.seed(1234) + env = simpy.Environment() + trace(env, monitor_data, resource_normalisation={"memory": 2000}) + globals.job_generator = htcondor_export_job_generator(filename="condor_usage_sorted_filtered.csv", + job_queue=globals.job_queue, + env=env) + env.process(globals.job_generator) + for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: + pool = Pool(env, resources=resources) + globals.pools.append(pool) + SimulatedCostController(env, target=pool, rate=1) + globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) + env.run(until=2000) + + generate_plots() print("final cost: %.2f" % globals.monitoring_data[sorted(globals.monitoring_data.keys())[-1]]["acc_cost"]) diff --git a/scheduler.py b/scheduler.py index 8a6dfb0..3a4fd44 100644 --- a/scheduler.py +++ b/scheduler.py @@ -12,7 +12,7 @@ def job_scheduler(env): yield env.timeout(1) -def htcondor_job_scheduler(env): +class CondorJobScheduler(object): """ Goal of the htcondor job scheduler is to have a scheduler that somehow mimics how htcondor does schedule jobs. Htcondor does scheduling based on a priority queue. The priorities itself are managed by operators of htcondor. 
@@ -25,7 +25,47 @@ def htcondor_job_scheduler(env): :param env: :return: """ - def schedule_pool(job): + def __init__(self, env, job_queue): + self.env = env + self.job_queue = job_queue + self.action = env.process(self.run()) + + def run(self): + # current_job = None + # postponed_unmatched_job = False + while True: + for job in self.job_queue: + best_match = self._schedule_job(job) + if best_match: + self.env.process(best_match.start_job(*job)) + self.job_queue.remove(job) + yield self.env.timeout(0) + yield self.env.timeout(60) + # if not postponed_unmatched_job and len(self._unscheduled_jobs) > 0: + # for job in self._unscheduled_jobs: + # best_match = self._schedule_job(job) + # if best_match: + # self.env.process(best_match.start_job(*job)) + # self._unscheduled_jobs.remove(job) + # yield self.env.timeout(0) + # if not current_job: + # current_job = next(self.job_queue) + # if current_job: + # best_match = self._schedule_job(current_job) + # if best_match: + # self.env.process(best_match.start_job(*current_job)) + # current_job = None + # yield self.env.timeout(0) + # else: + # postponed_unmatched_job = True + # self._unscheduled_jobs.append(current_job) + # current_job = None + # yield self.env.timeout(0) + # else: + # postponed_unmatched_job = False + # yield self.env.timeout(60) + + def _schedule_job(self, job): priorities = {} for pool in globals.pools: for drone in pool.drones: @@ -56,33 +96,3 @@ def schedule_pool(job): except ValueError: pass return None - - unscheduled_jobs = [] - current_job = None - postponed_unmatched_job = False - while True: - if not postponed_unmatched_job and len(unscheduled_jobs) > 0: - for job in unscheduled_jobs: - best_match = schedule_pool(job) - if best_match: - env.process(best_match.start_job(*job)) - unscheduled_jobs.remove(job) - yield env.timeout(0) - if not current_job and globals.global_demand.level - len(unscheduled_jobs) > 0: - current_job = next(globals.job_generator) - if current_job: - best_match = schedule_pool(current_job) - if best_match: - env.process(best_match.start_job(*current_job)) - current_job = None - yield env.timeout(0) - else: - postponed_unmatched_job = True - unscheduled_jobs.append(current_job) - current_job = None - yield env.timeout(0) - else: - postponed_unmatched_job = False - yield env.timeout(60) - - From a31887b28b9cff7c6b314d7f22b633cd768dc7c2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 14:55:20 +0200 Subject: [PATCH 033/648] caching of allocation and utilisation at drone level, related to issue #4 --- drone.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drone.py b/drone.py index 6e880c4..29dc791 100644 --- a/drone.py +++ b/drone.py @@ -12,6 +12,8 @@ def __init__(self, env, pool, scheduling_duration): self.resources = {resource: 0 for resource in self.pool.resources} self._supply = 0 self.jobs = 0 + self._allocation = None + self._utilisation = None def run(self, scheduling_duration): yield self.env.timeout(scheduling_duration) @@ -32,17 +34,22 @@ def demand(self, value): @property def utilisation(self): - resources = [] - for resource_key, value in self.resources.items(): - resources.append(value / self.pool.resources[resource_key]) - return min(resources) + if self._utilisation is None: + self._init_allocation_and_utilisation() + return self._utilisation @property def allocation(self): + if self._allocation is None: + self._init_allocation_and_utilisation() + return self._allocation + + def _init_allocation_and_utilisation(self): resources = 
[] for resource_key, value in self.resources.items(): resources.append(value / self.pool.resources[resource_key]) - return max(resources) + self._allocation = max(resources) + self._utilisation = min(resources) def shutdown(self): self._supply = 0 @@ -54,11 +61,15 @@ def start_job(self, walltime, resources, used_resources=None): if self.resources[resource_key] + resources[resource_key]: # TODO: kill job pass + self._utilisation = None + self._allocation = None for resource_key in resources: self.resources[resource_key] += resources[resource_key] self.jobs += 1 yield from Job(self.env, walltime, resources) self.jobs -= 1 + self._utilisation = None + self._allocation = None for resource_key in resources: self.resources[resource_key] -= resources[resource_key] # put drone back into pool queue From 2a56f33b30b0b954f66a0d007702081dc632725e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 15:39:30 +0200 Subject: [PATCH 034/648] removed connection between drone and pool to notify that drone has started in relation to issue #5 --- drone.py | 1 - pool.py | 29 ++++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drone.py b/drone.py index 6e880c4..a839e06 100644 --- a/drone.py +++ b/drone.py @@ -16,7 +16,6 @@ def __init__(self, env, pool, scheduling_duration): def run(self, scheduling_duration): yield self.env.timeout(scheduling_duration) self._supply = 1 - self.pool.drone_ready(self) @property def supply(self): diff --git a/pool.py b/pool.py index 2367964..8c0f105 100644 --- a/pool.py +++ b/pool.py @@ -9,7 +9,7 @@ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8, " super(Pool, self).__init__(env, capacity, init) self.resources = resources self._demand = 0 - self.drones = [] + self._drones = [] self.env = env self.action = env.process(self.run()) @@ -19,27 +19,33 @@ def run(self): while drones_required > 0: drones_required -= 1 # start a new drone - Drone(self.env, self, 10) + self._drones.append(Drone(self.env, self, 10)) yield self.put(1) if self.level > self._demand: - for drone in self.drones: + for drone in self._drones: if drone.jobs == 0: break else: break yield self.get(1) - self.drones.remove(drone) + self._drones.remove(drone) yield from drone.shutdown() del drone yield self.env.timeout(1) + @property + def drones(self): + for drone in self._drones: + if drone.supply > 0: + yield drone + def drone_demand(self): - return len(self.drones) + return len(self._drones) @property def allocation(self) -> float: allocations = [] - for drone in self.drones: + for drone in self._drones: allocations.append(drone.allocation) try: return sum(allocations) / len(allocations) @@ -49,8 +55,9 @@ def allocation(self) -> float: @property def utilisation(self) -> float: utilisations = [] - for drone in self.drones: - utilisations.append(drone.utilisation) + for drone in self._drones: + if drone.allocation > 0: + utilisations.append(drone.utilisation) try: return sum(utilisations) / len(utilisations) except ZeroDivisionError: @@ -59,7 +66,7 @@ def utilisation(self) -> float: @property def supply(self): supply = 0 - for drone in self.drones: + for drone in self._drones: supply += drone.supply return supply @@ -73,7 +80,3 @@ def demand(self, value): self._demand = value else: self._demand = 0 - - def drone_ready(self, drone): - # print("[drone %s] is ready at %d" % (drone, self.env.now)) - self.drones.append(drone) From c392cbf6d8a10e580bb323a2b33626e684c1d859 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 
15:42:52 +0200 Subject: [PATCH 035/648] pool resources are now part of initialisation of drones, fixes #5 --- drone.py | 10 +++++----- pool.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drone.py b/drone.py index a839e06..78f3064 100644 --- a/drone.py +++ b/drone.py @@ -4,12 +4,12 @@ class Drone(interfaces.Pool): - def __init__(self, env, pool, scheduling_duration): + def __init__(self, env, pool_resources, scheduling_duration): super(Drone, self).__init__() self.env = env - self.pool = pool + self.pool_resources = pool_resources self.action = env.process(self.run(scheduling_duration)) - self.resources = {resource: 0 for resource in self.pool.resources} + self.resources = {resource: 0 for resource in self.pool_resources} self._supply = 0 self.jobs = 0 @@ -33,14 +33,14 @@ def demand(self, value): def utilisation(self): resources = [] for resource_key, value in self.resources.items(): - resources.append(value / self.pool.resources[resource_key]) + resources.append(value / self.pool_resources[resource_key]) return min(resources) @property def allocation(self): resources = [] for resource_key, value in self.resources.items(): - resources.append(value / self.pool.resources[resource_key]) + resources.append(value / self.pool_resources[resource_key]) return max(resources) def shutdown(self): diff --git a/pool.py b/pool.py index 8c0f105..cf5e5b2 100644 --- a/pool.py +++ b/pool.py @@ -19,7 +19,7 @@ def run(self): while drones_required > 0: drones_required -= 1 # start a new drone - self._drones.append(Drone(self.env, self, 10)) + self._drones.append(Drone(self.env, self.resources, 10)) yield self.put(1) if self.level > self._demand: for drone in self._drones: From 0bf6eb7032d2a7a2fc0f83bc698d93e4a4beef55 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 16:10:42 +0200 Subject: [PATCH 036/648] job is now an object in job_queue instead of tuple, related to issue #3 --- drone.py | 16 ++++++++-------- job.py | 16 +++++++--------- scheduler.py | 8 ++++---- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drone.py b/drone.py index 78f3064..9375c0f 100644 --- a/drone.py +++ b/drone.py @@ -48,17 +48,17 @@ def shutdown(self): yield self.env.timeout(1) # print("[drone %s] has been shut down" % self) - def start_job(self, walltime, resources, used_resources=None): - for resource_key in resources: - if self.resources[resource_key] + resources[resource_key]: + def start_job(self, job): + for resource_key in job.resources: + if self.resources[resource_key] + job.resources[resource_key]: # TODO: kill job pass - for resource_key in resources: - self.resources[resource_key] += resources[resource_key] + for resource_key in job.resources: + self.resources[resource_key] += job.resources[resource_key] self.jobs += 1 - yield from Job(self.env, walltime, resources) + yield from job.process() self.jobs -= 1 - for resource_key in resources: - self.resources[resource_key] -= resources[resource_key] + for resource_key in job.resources: + self.resources[resource_key] -= job.resources[resource_key] # put drone back into pool queue # print("[drone %s] finished job at %d" % (self, self.env.now)) diff --git a/job.py b/job.py index 6d25c1c..cf7085c 100644 --- a/job.py +++ b/job.py @@ -35,12 +35,12 @@ def job_demand(env): class Job(object): - def __init__(self, env, walltime, resources, used_resources=None): + def __init__(self, env, walltime, resources, used_resources=None, in_queue_since=0): self.env = env self.resources = resources - self.walltime = walltime + 
self.walltime = float(walltime) - def __iter__(self): + def process(self): # print(self, "starting job at", self.env.now, "with duration", self.walltime) yield self.env.timeout(self.walltime) # print(self, "job finished", self.env.now) @@ -66,17 +66,15 @@ def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): current_time = float(row[header.index("QDate")]) - base_date if env.now >= current_time: count += 1 - job_queue.append(( - float(row[header.index("RemoteWallClockTime")]), - { + job_queue.append(Job( + env, row[header.index("RemoteWallClockTime")], resources={ "cores": int(row[header.index("RequestCpus")]), # "disk": int(row[header.index("RequestDisk")]), "memory": float(row[header.index("RequestMemory")]) - }, - { + }, used_resources={ "memory": float(row[header.index("MemoryUsage")]), "disk": int(row[header.index("DiskUsage_RAW")]) - })) + }, in_queue_since=env.now)) row = None else: if count > 0: diff --git a/scheduler.py b/scheduler.py index 3a4fd44..f9fbc77 100644 --- a/scheduler.py +++ b/scheduler.py @@ -37,7 +37,7 @@ def run(self): for job in self.job_queue: best_match = self._schedule_job(job) if best_match: - self.env.process(best_match.start_job(*job)) + self.env.process(best_match.start_job(job)) self.job_queue.remove(job) yield self.env.timeout(0) yield self.env.timeout(60) @@ -70,17 +70,17 @@ def _schedule_job(self, job): for pool in globals.pools: for drone in pool.drones: cost = 0 - resource_types = {*drone.resources.keys(), *job[1].keys()} + resource_types = {*drone.resources.keys(), *job.resources.keys()} for resource_type in resource_types: if resource_type not in drone.resources.keys(): cost = float("Inf") elif (pool.resources[resource_type] - drone.resources[resource_type]) < \ - job[1][resource_type]: + job.resources[resource_type]: cost = float("Inf") break else: cost += (pool.resources[resource_type] - drone.resources[resource_type]) // \ - job[1][resource_type] + job.resources[resource_type] cost /= len(resource_types) if cost <= 1: # directly start job From 55d72078494f5508b35850fefe6963dda441caff Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 16:11:22 +0200 Subject: [PATCH 037/648] removed unnecessary code --- scheduler.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/scheduler.py b/scheduler.py index f9fbc77..8e750bf 100644 --- a/scheduler.py +++ b/scheduler.py @@ -41,29 +41,6 @@ def run(self): self.job_queue.remove(job) yield self.env.timeout(0) yield self.env.timeout(60) - # if not postponed_unmatched_job and len(self._unscheduled_jobs) > 0: - # for job in self._unscheduled_jobs: - # best_match = self._schedule_job(job) - # if best_match: - # self.env.process(best_match.start_job(*job)) - # self._unscheduled_jobs.remove(job) - # yield self.env.timeout(0) - # if not current_job: - # current_job = next(self.job_queue) - # if current_job: - # best_match = self._schedule_job(current_job) - # if best_match: - # self.env.process(best_match.start_job(*current_job)) - # current_job = None - # yield self.env.timeout(0) - # else: - # postponed_unmatched_job = True - # self._unscheduled_jobs.append(current_job) - # current_job = None - # yield self.env.timeout(0) - # else: - # postponed_unmatched_job = False - # yield self.env.timeout(60) def _schedule_job(self, job): priorities = {} From b13116cddc5990e047c1969a45d06bc129d418fb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 16:56:22 +0200 Subject: [PATCH 038/648] added possibility to get waiting time from job, fixes #3 --- 
job.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/job.py b/job.py index cf7085c..30bb221 100644 --- a/job.py +++ b/job.py @@ -39,10 +39,19 @@ def __init__(self, env, walltime, resources, used_resources=None, in_queue_since self.env = env self.resources = resources self.walltime = float(walltime) + self.in_queue_since = in_queue_since + self.in_queue_until = None + + @property + def waiting_time(self): + if self.in_queue_until is not None: + return self.in_queue_until - self.in_queue_since + return float("Inf") def process(self): + self.in_queue_until = self.env.now # print(self, "starting job at", self.env.now, "with duration", self.walltime) - yield self.env.timeout(self.walltime) + yield self.env.timeout(self.walltime, value=self) # print(self, "job finished", self.env.now) From e00d7100bec1636f721795cce5fd249d32129c1f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 20 Sep 2018 16:57:08 +0200 Subject: [PATCH 039/648] added plot for distribution of waiting times for jobs --- main.py | 81 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/main.py b/main.py index 9e9e6ef..bda31a9 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,7 @@ import globals from cost import cobald_cost -from job import job_demand, htcondor_export_job_generator +from job import job_demand, htcondor_export_job_generator, Job from scheduler import CondorJobScheduler from pool import Pool from controller import SimulatedCostController @@ -28,6 +28,12 @@ def tracing_step(): def monitor(data, t, prio, eid, event, resource_normalisation): + if event.value: + if isinstance(event.value, Job): + try: + globals.monitoring_data["job_waiting_times"].append(event.value.waiting_time) + except AttributeError: + globals.monitoring_data["job_waiting_times"] = [event.value.waiting_time] global last_step if t > last_step: # new data to be recorded @@ -72,62 +78,68 @@ def monitor(data, t, prio, eid, event, resource_normalisation): result["cost"] = cost globals.cost += cost result["acc_cost"] = globals.cost - globals.monitoring_data[tmp].update(result) + monitoring_data = globals.monitoring_data["timesteps"] + try: + monitoring_data[tmp].update(result) + except KeyError: + monitoring_data[tmp] = result # print("%s [Pool %s] drones %d, demand %d, supply %d (%d); allocation %.2f, utilisation %.2f" % ( # tmp, pool, len(pool.drones), pool.demand, pool.supply, pool.level, pool.allocation, pool.utilisation)) def generate_plots(): # Plotting some first results - plt.plot(globals.monitoring_data.keys(), - [value.get("user_demand", None) for value in globals.monitoring_data.values()], + keys = globals.monitoring_data["timesteps"].keys() + values = globals.monitoring_data["timesteps"].values() + plt.plot(keys, + [value.get("user_demand", None) for value in values], label="Accumulated demand") - plt.plot(globals.monitoring_data.keys(), - [value.get("user_demand_new", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("user_demand_new", None) for value in values], 'ro', label="Current demand") - plt.plot(globals.monitoring_data.keys(), - [value.get("pool_demand", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("pool_demand", None) for value in values], label="Pool demand") - plt.plot(globals.monitoring_data.keys(), - [value.get("pool_supply", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("pool_supply", None) for value in values], 
label="Pool supply") - plt.plot(globals.monitoring_data.keys(), - [value.get("running_jobs", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("running_jobs", None) for value in values], label="Running jobs") plt.legend() plt.show() - plt.plot(globals.monitoring_data.keys(), - [value.get("pool_utilisation", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("pool_utilisation", None) for value in values], label="Pool utilisation") - plt.plot(globals.monitoring_data.keys(), - [value.get("pool_allocation", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("pool_allocation", None) for value in values], label="Pool allocation") - plt.plot(globals.monitoring_data.keys(), - [value.get("empty_drones", None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("empty_drones", None) for value in values], label="Unallocated drones") plt.legend() plt.show() for index, pool in enumerate(globals.pools): print("pool", index, "has", pool.resources) - plt.plot(globals.monitoring_data.keys(), - [value.get("pool_%s_supply" % pool, None) for value in globals.monitoring_data.values()], + plt.plot(keys, + [value.get("pool_%s_supply" % pool, None) for value in values], label="Pool %d supply" % index) plt.legend() plt.show() fig, ax1 = plt.subplots() - ax1.plot(globals.monitoring_data.keys(), - [value.get("cost", None) for value in globals.monitoring_data.values()], 'b-') + ax1.plot(keys, + [value.get("cost", None) for value in values], 'b-') ax1.set_xlabel('Time') # Make the y-axis label, ticks and tick labels match the line color. ax1.set_ylabel('Cost', color='b') ax1.tick_params('y', colors='b') ax2 = ax1.twinx() - ax2.plot(globals.monitoring_data.keys(), - [value.get("acc_cost", None) for value in globals.monitoring_data.values()], 'r.') + ax2.plot(keys, + [value.get("acc_cost", None) for value in values], 'r.') ax2.set_ylabel('Accumulated Cost', color='r') ax2.tick_params('y', colors='r') @@ -136,27 +148,32 @@ def generate_plots(): # resource plot for max fig, ax = plt.subplots(2, sharex=True) - ax[0].plot(globals.monitoring_data.keys(), - [value.get("unused_resources", None) for value in globals.monitoring_data.values()], + ax[0].plot(keys, + [value.get("unused_resources", None) for value in values], label="Unused") - ax[0].plot(globals.monitoring_data.keys(), - [value.get("used_resources", None) for value in globals.monitoring_data.values()], + ax[0].plot(keys, + [value.get("used_resources", None) for value in values], label="Used") ax[0].set_title("Resource utilisation") ax[0].legend() percentages = [] percentage_means = [] - for value in globals.monitoring_data.values(): + for value in values: try: percentages.append(value.get("unused_resources", 0) / value.get("available_resources", 0)) except ZeroDivisionError: percentages.append(1) percentage_means.append(sum(percentages) / len(percentages)) - ax[1].plot(globals.monitoring_data.keys(), percentages) - ax[1].plot(globals.monitoring_data.keys(), percentage_means, label="mean") + ax[1].plot(keys, percentages) + ax[1].plot(keys, percentage_means, label="mean") ax[1].set_title("Percentage of unused resources") fig.show() + # waiting time histogram + plt.hist(globals.monitoring_data["job_waiting_times"], label="Job waiting times") + plt.legend() + plt.show() + def main(): monitor_data = partial(monitor, globals.monitoring_data) @@ -176,7 +193,7 @@ def main(): env.run(until=2000) generate_plots() - print("final cost: %.2f" % 
globals.monitoring_data[sorted(globals.monitoring_data.keys())[-1]]["acc_cost"]) + print("final cost: %.2f" % globals.monitoring_data["timesteps"][sorted(globals.monitoring_data["timesteps"].keys())[-1]]["acc_cost"]) if __name__ == "__main__": From a67ab7c90824f3a29d8aba10576faed92648c543 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Oct 2018 13:22:19 +0200 Subject: [PATCH 040/648] fixed access to pool resources --- drone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drone.py b/drone.py index 6b16449..d30c88a 100644 --- a/drone.py +++ b/drone.py @@ -46,7 +46,7 @@ def allocation(self): def _init_allocation_and_utilisation(self): resources = [] for resource_key, value in self.resources.items(): - resources.append(value / self.pool.resources[resource_key]) + resources.append(value / self.pool_resources[resource_key]) self._allocation = max(resources) self._utilisation = min(resources) From 87f2e29354b6d68dec59fba71a15c4c2ac016e46 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Oct 2018 16:32:00 +0200 Subject: [PATCH 041/648] added functionality to kill jobs when they are exceeding available resources, fixes issue #2 --- drone.py | 33 +++++++++++++++++++-------------- job.py | 17 ++++++++++++----- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/drone.py b/drone.py index d30c88a..0e93f1a 100644 --- a/drone.py +++ b/drone.py @@ -10,6 +10,8 @@ def __init__(self, env, pool_resources, scheduling_duration): self.pool_resources = pool_resources self.action = env.process(self.run(scheduling_duration)) self.resources = {resource: 0 for resource in self.pool_resources} + # shadowing requested resources to determine jobs to be killed + self.used_resources = {resource: 0 for resource in self.pool_resources} self._supply = 0 self.jobs = 0 self._allocation = None @@ -56,32 +58,35 @@ def shutdown(self): # print("[drone %s] has been shut down" % self) def start_job(self, job, kill=False): + """ + Method manages to start a job in the context of the given drone. + The job is started independent of available resources. If resources of drone are exceeded, the job is killed. 
+ + :param job: the job to start + :param kill: if True, a job is killed when used resources exceed requested resources + :return: + """ + self._utilisation = None + self._allocation = None + self.jobs += 1 + job_execution = job.process() for resource_key in job.resources: - if self.resources[resource_key] + job.resources[resource_key]: - # TODO: kill job - pass + if self.used_resources[resource_key] + job.used_resources[resource_key] > self.pool_resources[resource_key]: + job.kill() if job.resources[resource_key] < job.used_resources[resource_key]: if kill: job.kill() else: pass - return - if self.resources[resource_key] + job.resources[resource_key] > self.pool_resources[resource_key] and kill: - if kill: - job.kill() - else: - pass - return - self._utilisation = None - self._allocation = None for resource_key in job.resources: self.resources[resource_key] += job.resources[resource_key] - self.jobs += 1 - yield from job.process() + self.used_resources[resource_key] += job.used_resources[resource_key] + yield job_execution self.jobs -= 1 self._utilisation = None self._allocation = None for resource_key in job.resources: self.resources[resource_key] -= job.resources[resource_key] + self.used_resources[resource_key] -= job.used_resources[resource_key] # put drone back into pool queue # print("[drone %s] finished job at %d" % (self, self.env.now)) diff --git a/job.py b/job.py index ab28b90..c8ec209 100644 --- a/job.py +++ b/job.py @@ -1,6 +1,7 @@ import random import math import csv +import simpy import globals @@ -42,6 +43,7 @@ def __init__(self, env, walltime, resources, used_resources=None, in_queue_since self.walltime = float(walltime) self.in_queue_since = in_queue_since self.in_queue_until = None + self.processing = None @property def waiting_time(self): @@ -51,13 +53,18 @@ def waiting_time(self): def process(self): self.in_queue_until = self.env.now - # print(self, "starting job at", self.env.now, "with duration", self.walltime) - yield self.env.timeout(self.walltime, value=self) - # print(self, "job finished", self.env.now) + self.processing = self.env.process(self._process()) + return self.processing + + def _process(self): + try: + yield self.env.timeout(self.walltime, value=self) + except simpy.exceptions.Interrupt: + pass def kill(self): # job exceeds either own requested resources or resources provided by drone - print("----> killing job", self) + self.processing.interrupt(cause=self) def job_property_generator(**kwargs): @@ -89,7 +96,7 @@ def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): "cores": float(row[header.index("RemoteSysCpu")]) + float(row[header.index("RemoteUserCpu")]) / float(row[header.index("RemoteWallClockTime")]), "memory": float(row[header.index("MemoryUsage")]), - "disk": int(row[header.index("DiskUsage_RAW")]) + # "disk": int(row[header.index("DiskUsage_RAW")]) }, in_queue_since=env.now)) row = None else: From cb75aed4ddec4d066caa3a9b13e843bb40889b70 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Oct 2018 16:32:29 +0200 Subject: [PATCH 042/648] added plot for visualisation of exceeded resources of jobs, related to issue #2 --- main.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 54f97b3..0dcab20 100644 --- a/main.py +++ b/main.py @@ -28,8 +28,16 @@ def tracing_step(): def monitor(data, t, prio, eid, event, resource_normalisation): - print(event) if event.value: + if isinstance(event.value, simpy.exceptions.Interrupt): + job = event.value.cause + for 
resource_key, usage in job.used_resources.items(): + value = job.resources[resource_key] / usage + if value > 1: + try: + globals.monitoring_data["job_exceeds_%s" % resource_key].append(value) + except AttributeError: + globals.monitoring_data["job_exceeds_%s" % resource_key] = [value] if isinstance(event.value, Job): try: globals.monitoring_data["job_waiting_times"].append(event.value.waiting_time) @@ -175,6 +183,13 @@ def generate_plots(): plt.legend() plt.show() + for resource_key in [key for key in globals.monitoring_data.keys() if + isinstance(key, str) and key.startswith("job_exceeds_")]: + plt.hist(globals.monitoring_data[resource_key], label="Job exceeding %s" % + resource_key.replace("job_exceeds_", "")) + plt.legend() + plt.show() + def main(): monitor_data = partial(monitor, globals.monitoring_data) From 1a8459cb6bdb29e46adea23703ae79c184cce189 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Oct 2018 16:38:56 +0200 Subject: [PATCH 043/648] removing unused import --- drone.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/drone.py b/drone.py index 0e93f1a..12a99f9 100644 --- a/drone.py +++ b/drone.py @@ -1,7 +1,5 @@ from cobald import interfaces -from job import Job - class Drone(interfaces.Pool): def __init__(self, env, pool_resources, scheduling_duration): From 78c898a41f2f8ef11a5d7ecab3503faaf40cedff Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Oct 2018 17:00:46 +0200 Subject: [PATCH 044/648] added definition for static pool, solves issue #1 --- pool.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pool.py b/pool.py index bee8aab..c6cc9dc 100644 --- a/pool.py +++ b/pool.py @@ -80,3 +80,17 @@ def demand(self, value): self._demand = value else: self._demand = 0 + + +class StaticPool(Pool): + def __init__(self, env, init=0, resources={"memory": 8, "cores": 1}): + assert init > 0, "Static pool was initialised without any resources..." 
+ super(StaticPool, self).__init__(env, capacity=init, init=init, resources=resources) + self._demand = init + for _ in range(init): + self._drones.append(Drone(self.env, self.resources, 0)) + self.put(init) + + def run(self): + while True: + yield self.env.timeout(float("Inf")) From af82b2974823c671e5f024669d4522b64c919def Mon Sep 17 00:00:00 2001 From: Sven Lange Date: Sun, 14 Oct 2018 20:57:48 +0200 Subject: [PATCH 045/648] fixed calculation of exceeded resources of jobs --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 0dcab20..6908284 100644 --- a/main.py +++ b/main.py @@ -32,7 +32,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): if isinstance(event.value, simpy.exceptions.Interrupt): job = event.value.cause for resource_key, usage in job.used_resources.items(): - value = job.resources[resource_key] / usage + value = usage / job.resources[resource_key] if value > 1: try: globals.monitoring_data["job_exceeds_%s" % resource_key].append(value) From 85105d39ccc888cb539f97944ce6ddbb81938854 Mon Sep 17 00:00:00 2001 From: "eileen.kuehn" Date: Sat, 20 Oct 2018 15:21:56 +0200 Subject: [PATCH 046/648] Add LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..392dcce --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Eileen Kuehn + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From 9b725e7cb851082dcab640c2c7e16040b64e754e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 21 Oct 2018 09:19:18 +0200 Subject: [PATCH 047/648] monitored data is now logged to two logging handlers, fixes issue #8 --- globals.py | 1 - job.py | 6 ++++-- main.py | 47 ++++++++++++++++++++++++++++------------------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/globals.py b/globals.py index 78f6476..e7c0f9d 100644 --- a/globals.py +++ b/globals.py @@ -4,5 +4,4 @@ job_queue = [] job_generator = None job_scheduler = None -monitoring_data = defaultdict(dict) # {tme: {variable: value, ...}} cost = 0 diff --git a/job.py b/job.py index c8ec209..cc2ecff 100644 --- a/job.py +++ b/job.py @@ -2,6 +2,7 @@ import math import csv import simpy +import logging import globals @@ -31,7 +32,7 @@ def job_demand(env): value = round(value) if value > 0: globals.global_demand.put(value) - globals.monitoring_data[round(env.now)]["user_demand_new"] = value + logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": value}) # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) @@ -99,8 +100,9 @@ def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): # "disk": int(row[header.index("DiskUsage_RAW")]) }, in_queue_since=env.now)) row = None + current_time = 0 else: if count > 0: - globals.monitoring_data[round(env.now)]["user_demand_new"] = count + logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": count}) count = 0 yield env.timeout(1) diff --git a/main.py b/main.py index 6908284..2de8f18 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,9 @@ from functools import partial, wraps import simpy import random +import logging.handlers + +from cobald.monitor.format_json import JsonFormatter import matplotlib.pyplot as plt @@ -8,10 +11,27 @@ from cost import cobald_cost from job import job_demand, htcondor_export_job_generator, Job from scheduler import CondorJobScheduler -from pool import Pool +from pool import Pool, StaticPool from controller import SimulatedCostController +class JSONSocketHandler(logging.handlers.SocketHandler): + def makePickle(self, record): + return self.format(record).encode() + + +monitoring_logger = logging.getLogger("general") +monitoring_logger.setLevel(logging.DEBUG) +socketHandler = JSONSocketHandler( + 'localhost', + logging.handlers.DEFAULT_TCP_LOGGING_PORT) +streamHandler = logging.StreamHandler() +socketHandler.setFormatter(JsonFormatter()) +streamHandler.setFormatter(JsonFormatter()) +monitoring_logger.addHandler(socketHandler) +monitoring_logger.addHandler(streamHandler) + + def trace(env, callback, resource_normalisation): def get_wrapper(env_step, callback): @wraps(env_step) @@ -34,15 +54,9 @@ def monitor(data, t, prio, eid, event, resource_normalisation): for resource_key, usage in job.used_resources.items(): value = usage / job.resources[resource_key] if value > 1: - try: - globals.monitoring_data["job_exceeds_%s" % resource_key].append(value) - except AttributeError: - globals.monitoring_data["job_exceeds_%s" % resource_key] = [value] + monitoring_logger.info(str(round(t)), {"job_exceeds_%s" % resource_key: value}) if isinstance(event.value, Job): - try: - globals.monitoring_data["job_waiting_times"].append(event.value.waiting_time) - except AttributeError: - globals.monitoring_data["job_waiting_times"] = [event.value.waiting_time] + monitoring_logger.info(str(round(t)), {"job_waiting_times": event.value.waiting_time}) global last_step if t > last_step: 
# new data to be recorded @@ -61,7 +75,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): for pool in globals.pools: pool_demand += pool.demand pool_supply += pool.supply - result["pool_%s_supply" % pool] = pool.supply + result["pool_%s_supply" % id(pool)] = pool.supply pool_utilisation += pool.utilisation pool_allocation += pool.allocation for drone in pool.drones: @@ -87,13 +101,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation): result["cost"] = cost globals.cost += cost result["acc_cost"] = globals.cost - monitoring_data = globals.monitoring_data["timesteps"] - try: - monitoring_data[tmp].update(result) - except KeyError: - monitoring_data[tmp] = result - # print("%s [Pool %s] drones %d, demand %d, supply %d (%d); allocation %.2f, utilisation %.2f" % ( - # tmp, pool, len(pool.drones), pool.demand, pool.supply, pool.level, pool.allocation, pool.utilisation)) + monitoring_logger.info(str(tmp), result) def generate_plots(): @@ -192,11 +200,12 @@ def generate_plots(): def main(): - monitor_data = partial(monitor, globals.monitoring_data) + resource_normalisation = {"memory": 2000} + monitor_data = partial(monitor, resource_normalisation) random.seed(1234) env = simpy.Environment() - trace(env, monitor_data, resource_normalisation={"memory": 2000}) + trace(env, monitor_data, resource_normalisation=resource_normalisation) globals.job_generator = htcondor_export_job_generator(filename="condor_usage_sorted_filtered.csv", job_queue=globals.job_queue, env=env) From 5357837f7b46b760e0b13257a03b27b4e29b96a9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 21 Oct 2018 09:43:43 +0200 Subject: [PATCH 048/648] correct calculation of used cpu cores, fixes issue #12 --- job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/job.py b/job.py index cc2ecff..3f63086 100644 --- a/job.py +++ b/job.py @@ -94,7 +94,7 @@ def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): # "disk": int(row[header.index("RequestDisk")]), "memory": float(row[header.index("RequestMemory")]) }, used_resources={ - "cores": float(row[header.index("RemoteSysCpu")]) + float(row[header.index("RemoteUserCpu")]) / + "cores": (float(row[header.index("RemoteSysCpu")]) + float(row[header.index("RemoteUserCpu")])) / float(row[header.index("RemoteWallClockTime")]), "memory": float(row[header.index("MemoryUsage")]), # "disk": int(row[header.index("DiskUsage_RAW")]) From ac11e2bf4fcdd649b8640d6ffba5203979e3ca25 Mon Sep 17 00:00:00 2001 From: Sven Lange Date: Wed, 17 Oct 2018 21:12:25 +0200 Subject: [PATCH 049/648] make simulation time and input file configurable --- main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 2de8f18..b8d8fff 100644 --- a/main.py +++ b/main.py @@ -199,14 +199,14 @@ def generate_plots(): plt.show() -def main(): +def main(filename="condor_usage_sorted_filtered.csv", until=2000): resource_normalisation = {"memory": 2000} monitor_data = partial(monitor, resource_normalisation) random.seed(1234) env = simpy.Environment() trace(env, monitor_data, resource_normalisation=resource_normalisation) - globals.job_generator = htcondor_export_job_generator(filename="condor_usage_sorted_filtered.csv", + globals.job_generator = htcondor_export_job_generator(filename=filename, job_queue=globals.job_queue, env=env) env.process(globals.job_generator) @@ -215,7 +215,7 @@ def main(): globals.pools.append(pool) SimulatedCostController(env, target=pool, rate=1) globals.job_scheduler = CondorJobScheduler(env=env, 
job_queue=globals.job_queue) - env.run(until=2000) + env.run(until=until) generate_plots() print("final cost: %.2f" % globals.monitoring_data["timesteps"][sorted(globals.monitoring_data["timesteps"].keys())[-1]]["acc_cost"]) From 8eef7f807368523cac4b4fbf7e5804fbcfaf4479 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 21 Oct 2018 16:29:53 +0200 Subject: [PATCH 050/648] removed accesses to monitoring data --- main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/main.py b/main.py index b8d8fff..63d20d8 100644 --- a/main.py +++ b/main.py @@ -214,12 +214,10 @@ def main(filename="condor_usage_sorted_filtered.csv", until=2000): pool = Pool(env, resources=resources) globals.pools.append(pool) SimulatedCostController(env, target=pool, rate=1) + globals.pools.append(StaticPool(env, init=2)) globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) env.run(until=until) - generate_plots() - print("final cost: %.2f" % globals.monitoring_data["timesteps"][sorted(globals.monitoring_data["timesteps"].keys())[-1]]["acc_cost"]) - if __name__ == "__main__": main() From b7f37413caf56b4d52dfaa5f7c26d8c7cc4b7af6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 21 Oct 2018 23:30:32 +0200 Subject: [PATCH 051/648] added method to read static pools from condor export, fixes issue #7 --- pool_io/__init__.py | 0 pool_io/htcondor.py | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 pool_io/__init__.py create mode 100644 pool_io/htcondor.py diff --git a/pool_io/__init__.py b/pool_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pool_io/htcondor.py b/pool_io/htcondor.py new file mode 100644 index 0000000..3745b61 --- /dev/null +++ b/pool_io/htcondor.py @@ -0,0 +1,23 @@ +import pandas as pd + +from pool import StaticPool + + +def htcondor_pool_reader(env, iterable, resource_name_mapping={ + "TotalSlotCPUs": "cores", + "TotalSlotDisk": "disk", + "TotalSlotMemory": "memory" +}): + """ + Load a pool configuration that was exported via htcondor from files or iterables + + :param iterable: an iterable yielding lines of CSV, such as an open file + :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation + :return: Yields the :py:class:`StaticPool`s found in the given iterable + """ + df = pd.read_csv(iterable, sep='\s{1,}', header=0, engine='python', thousands=',') + df = df.rename(columns=resource_name_mapping) + header = list(df.columns.values) + for row_idx, *row in df.itertuples(): + yield StaticPool(env, init=row[0], resources={key: row[header.index(key)] for key in + resource_name_mapping.values()}) From 35c5ba5fc33aaa88063a8f9722e461c5bce199ae Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 23 Oct 2018 19:56:10 +0200 Subject: [PATCH 052/648] removed dependency on pandas for reading of csv --- pool_io/htcondor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pool_io/htcondor.py b/pool_io/htcondor.py index 3745b61..f6aa7aa 100644 --- a/pool_io/htcondor.py +++ b/pool_io/htcondor.py @@ -1,4 +1,4 @@ -import pandas as pd +import csv from pool import StaticPool @@ -15,9 +15,10 @@ def htcondor_pool_reader(env, iterable, resource_name_mapping={ :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation :return: Yields the :py:class:`StaticPool`s found in the given iterable """ - df = pd.read_csv(iterable, sep='\s{1,}', header=0, engine='python', thousands=',') - df = 
df.rename(columns=resource_name_mapping) - header = list(df.columns.values) - for row_idx, *row in df.itertuples(): - yield StaticPool(env, init=row[0], resources={key: row[header.index(key)] for key in - resource_name_mapping.values()}) + reader = csv.reader(iterable, delimiter=' ', skipinitialspace=True) + first_line = next(reader) + for row_idx, row in enumerate(reader): + yield StaticPool( + env, + init=int(row[0]), + resources={value: row[first_line.index(key)] for key, value in resource_name_mapping.items()}) From 535c4a0893bb566072eaa8dc4dc3c8aac34b96ca Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 23 Oct 2018 21:14:09 +0200 Subject: [PATCH 053/648] restructured project to proper python module --- {pool_io => cobald_sim}/__init__.py | 0 controller.py => cobald_sim/controller.py | 0 cost.py => cobald_sim/cost.py | 0 drone.py => cobald_sim/drone.py | 0 globals.py => cobald_sim/globals.py | 0 job.py => cobald_sim/job.py | 0 pool.py => cobald_sim/pool.py | 0 cobald_sim/pool_io/__init__.py | 0 {pool_io => cobald_sim/pool_io}/htcondor.py | 0 scheduler.py => cobald_sim/scheduler.py | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename {pool_io => cobald_sim}/__init__.py (100%) rename controller.py => cobald_sim/controller.py (100%) rename cost.py => cobald_sim/cost.py (100%) rename drone.py => cobald_sim/drone.py (100%) rename globals.py => cobald_sim/globals.py (100%) rename job.py => cobald_sim/job.py (100%) rename pool.py => cobald_sim/pool.py (100%) create mode 100644 cobald_sim/pool_io/__init__.py rename {pool_io => cobald_sim/pool_io}/htcondor.py (100%) rename scheduler.py => cobald_sim/scheduler.py (100%) diff --git a/pool_io/__init__.py b/cobald_sim/__init__.py similarity index 100% rename from pool_io/__init__.py rename to cobald_sim/__init__.py diff --git a/controller.py b/cobald_sim/controller.py similarity index 100% rename from controller.py rename to cobald_sim/controller.py diff --git a/cost.py b/cobald_sim/cost.py similarity index 100% rename from cost.py rename to cobald_sim/cost.py diff --git a/drone.py b/cobald_sim/drone.py similarity index 100% rename from drone.py rename to cobald_sim/drone.py diff --git a/globals.py b/cobald_sim/globals.py similarity index 100% rename from globals.py rename to cobald_sim/globals.py diff --git a/job.py b/cobald_sim/job.py similarity index 100% rename from job.py rename to cobald_sim/job.py diff --git a/pool.py b/cobald_sim/pool.py similarity index 100% rename from pool.py rename to cobald_sim/pool.py diff --git a/cobald_sim/pool_io/__init__.py b/cobald_sim/pool_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pool_io/htcondor.py b/cobald_sim/pool_io/htcondor.py similarity index 100% rename from pool_io/htcondor.py rename to cobald_sim/pool_io/htcondor.py diff --git a/scheduler.py b/cobald_sim/scheduler.py similarity index 100% rename from scheduler.py rename to cobald_sim/scheduler.py From 14734ca0b244e35a90aae2321ca0c9a4619257e6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 23 Oct 2018 22:05:27 +0200 Subject: [PATCH 054/648] fixed imports after restructuring --- cobald_sim/controller.py | 2 -- cobald_sim/cost.py | 2 +- cobald_sim/globals.py | 2 -- cobald_sim/job.py | 3 --- cobald_sim/pool.py | 2 +- cobald_sim/pool_io/htcondor.py | 2 +- cobald_sim/scheduler.py | 2 +- main.py | 19 ++++++++++--------- 8 files changed, 14 insertions(+), 20 deletions(-) diff --git a/cobald_sim/controller.py b/cobald_sim/controller.py index d2514bb..d6198d4 100644 --- a/cobald_sim/controller.py +++ 
b/cobald_sim/controller.py @@ -2,8 +2,6 @@ from cobald.controller.relative_supply import RelativeSupplyController from cobald.interfaces import Pool -from cost import local_cobald_cost - class SimulatedLinearController(LinearController): def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): diff --git a/cobald_sim/cost.py b/cobald_sim/cost.py index f22918c..69059d0 100644 --- a/cobald_sim/cost.py +++ b/cobald_sim/cost.py @@ -1,4 +1,4 @@ -import globals +from cobald_sim import globals def cobald_cost(): diff --git a/cobald_sim/globals.py b/cobald_sim/globals.py index e7c0f9d..e1ac98c 100644 --- a/cobald_sim/globals.py +++ b/cobald_sim/globals.py @@ -1,5 +1,3 @@ -from collections import defaultdict - pools = [] job_queue = [] job_generator = None diff --git a/cobald_sim/job.py b/cobald_sim/job.py index 3f63086..71f3279 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -1,11 +1,8 @@ import random import math -import csv import simpy import logging -import globals - def job_demand(env): """ diff --git a/cobald_sim/pool.py b/cobald_sim/pool.py index c6cc9dc..ad83df0 100644 --- a/cobald_sim/pool.py +++ b/cobald_sim/pool.py @@ -1,7 +1,7 @@ from simpy.resources import container from cobald import interfaces -from drone import Drone +from .drone import Drone class Pool(interfaces.Pool, container.Container): diff --git a/cobald_sim/pool_io/htcondor.py b/cobald_sim/pool_io/htcondor.py index f6aa7aa..21c02e6 100644 --- a/cobald_sim/pool_io/htcondor.py +++ b/cobald_sim/pool_io/htcondor.py @@ -1,6 +1,6 @@ import csv -from pool import StaticPool +from ..pool import StaticPool def htcondor_pool_reader(env, iterable, resource_name_mapping={ diff --git a/cobald_sim/scheduler.py b/cobald_sim/scheduler.py index 8e750bf..8dccb9c 100644 --- a/cobald_sim/scheduler.py +++ b/cobald_sim/scheduler.py @@ -1,4 +1,4 @@ -import globals +from cobald_sim import globals # TODO: does not work anymore as there is no method get_drone at pool diff --git a/main.py b/main.py index 63d20d8..58bb26f 100644 --- a/main.py +++ b/main.py @@ -7,12 +7,13 @@ import matplotlib.pyplot as plt -import globals -from cost import cobald_cost -from job import job_demand, htcondor_export_job_generator, Job -from scheduler import CondorJobScheduler -from pool import Pool, StaticPool -from controller import SimulatedCostController +from cobald_sim import globals + +from cobald_sim.cost import cobald_cost +from cobald_sim.job import htcondor_export_job_generator, Job +from cobald_sim.scheduler import CondorJobScheduler +from cobald_sim.pool import Pool, StaticPool +from cobald_sim.controller import SimulatedCostController class JSONSocketHandler(logging.handlers.SocketHandler): @@ -97,9 +98,9 @@ def monitor(data, t, prio, eid, event, resource_normalisation): result["used_resources"] = used_resources result["unused_resources"] = unused_resources result["available_resources"] = available_resources - cost = cobald_cost() - result["cost"] = cost - globals.cost += cost + current_cost = cobald_cost() + result["cost"] = current_cost + globals.cost += current_cost result["acc_cost"] = globals.cost monitoring_logger.info(str(tmp), result) From 7ea0bc483ac46f9dceba59d35ba37dab64f37594 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 23 Oct 2018 22:08:34 +0200 Subject: [PATCH 055/648] improved import of htcondor jobs, related to issue #11 --- cobald_sim/job.py | 34 +++++++++++++--------------------- cobald_sim/job_io/__init__.py | 0 cobald_sim/job_io/htcondor.py | 33 +++++++++++++++++++++++++++++++++ 
3 files changed, 46 insertions(+), 21 deletions(-) create mode 100644 cobald_sim/job_io/__init__.py create mode 100644 cobald_sim/job_io/htcondor.py diff --git a/cobald_sim/job.py b/cobald_sim/job.py index 71f3279..08e79e0 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -34,11 +34,12 @@ def job_demand(env): class Job(object): - def __init__(self, env, walltime, resources, used_resources=None, in_queue_since=0): + def __init__(self, env, walltime, resources, used_resources=None, in_queue_since=0, schedule_date=0): self.env = env self.resources = resources self.used_resources = used_resources self.walltime = float(walltime) + self.schedule_date = schedule_date self.in_queue_since = in_queue_since self.in_queue_until = None self.processing = None @@ -71,33 +72,24 @@ def job_property_generator(**kwargs): def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): + from .job_io.htcondor import htcondor_job_reader + with open(filename, "r") as input_file: - htcondor_reader = csv.reader(input_file, delimiter=' ', quotechar="'") - header = next(htcondor_reader) - row = next(htcondor_reader) - base_date = float(row[header.index("QDate")]) + reader = htcondor_job_reader(env, input_file) + job = next(reader) + base_date = job.schedule_date current_time = 0 count = 0 while True: - if not row: - row = next(htcondor_reader) - current_time = float(row[header.index("QDate")]) - base_date + if not job: + job = next(reader) + current_time = job.schedule_date - base_date if env.now >= current_time: count += 1 - job_queue.append(Job( - env, row[header.index("RemoteWallClockTime")], resources={ - "cores": int(row[header.index("RequestCpus")]), - # "disk": int(row[header.index("RequestDisk")]), - "memory": float(row[header.index("RequestMemory")]) - }, used_resources={ - "cores": (float(row[header.index("RemoteSysCpu")]) + float(row[header.index("RemoteUserCpu")])) / - float(row[header.index("RemoteWallClockTime")]), - "memory": float(row[header.index("MemoryUsage")]), - # "disk": int(row[header.index("DiskUsage_RAW")]) - }, in_queue_since=env.now)) - row = None - current_time = 0 + job.in_queue_since = env.now + job_queue.append(job) + job = None else: if count > 0: logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": count}) diff --git a/cobald_sim/job_io/__init__.py b/cobald_sim/job_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cobald_sim/job_io/htcondor.py b/cobald_sim/job_io/htcondor.py new file mode 100644 index 0000000..3016c46 --- /dev/null +++ b/cobald_sim/job_io/htcondor.py @@ -0,0 +1,33 @@ +import csv + +from cobald_sim.job import Job + + +def htcondor_job_reader(env, iterable, resource_name_mapping={ + "cores": "RequestCpus", + "walltime": "RemoteWallClockTime", + "memory": "RequestMemory", + "disk": "RequestDisk" +}, used_resource_name_mapping={ + "scheduletime": "QDate", + "walltime": "RemoteWallClockTime", + "cores": "Number of Allocated Processors", + "memory": "MemoryUsage", + "disk": "DiskUsage_RAW" +}): + htcondor_reader = csv.DictReader(iterable, delimiter=' ', quotechar="'") + + for row in htcondor_reader: + yield Job( + env, + walltime=row[resource_name_mapping["walltime"]], + resources={ + "cores": int(row[resource_name_mapping["cores"]]), + # "disk": float(row[resource_name_mapping["disk"]]), + "memory": float(row[resource_name_mapping["memory"]]) + }, used_resources={ + "cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) / + float(row[used_resource_name_mapping["walltime"]]), + "memory": 
float(row[used_resource_name_mapping["memory"]]), + # "disk": float(row[used_resource_name_mapping["disk"]]) + }, schedule_date=float(row[used_resource_name_mapping["scheduletime"]])) From 40252808180c4dafd53c51acd4c1d4cb428bd741 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 23 Oct 2018 22:23:20 +0200 Subject: [PATCH 056/648] improved reading of pools from htcondor --- cobald_sim/pool_io/htcondor.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cobald_sim/pool_io/htcondor.py b/cobald_sim/pool_io/htcondor.py index 21c02e6..db1125b 100644 --- a/cobald_sim/pool_io/htcondor.py +++ b/cobald_sim/pool_io/htcondor.py @@ -4,9 +4,9 @@ def htcondor_pool_reader(env, iterable, resource_name_mapping={ - "TotalSlotCPUs": "cores", - "TotalSlotDisk": "disk", - "TotalSlotMemory": "memory" + "cores": "TotalSlotCPUs", + "disk": "TotalSlotDisk", + "memory": "TotalSlotMemory" }): """ Load a pool configuration that was exported via htcondor from files or iterables @@ -15,10 +15,9 @@ def htcondor_pool_reader(env, iterable, resource_name_mapping={ :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation :return: Yields the :py:class:`StaticPool`s found in the given iterable """ - reader = csv.reader(iterable, delimiter=' ', skipinitialspace=True) - first_line = next(reader) + reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) for row_idx, row in enumerate(reader): yield StaticPool( env, - init=int(row[0]), - resources={value: row[first_line.index(key)] for key, value in resource_name_mapping.items()}) + init=int(row["Count"]), + resources={key: row[value] for key, value in resource_name_mapping.items()}) From 8facb11295683091555388f929e06440b026c28e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 23 Oct 2018 22:56:38 +0200 Subject: [PATCH 057/648] added implementation for standard workflow format, related to issue #11 --- cobald_sim/job_io/swf.py | 49 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 cobald_sim/job_io/swf.py diff --git a/cobald_sim/job_io/swf.py b/cobald_sim/job_io/swf.py new file mode 100644 index 0000000..4881144 --- /dev/null +++ b/cobald_sim/job_io/swf.py @@ -0,0 +1,49 @@ +import csv + +from cobald_sim.job import Job + + +def swf_job_reader(env, iterable, resource_name_mapping={ + "cores": "Requested Number of Processors", + "walltime": "Requested Time", + "memory": "Requested Memory" +}, used_resource_name_mapping={ + "walltime": "Run Time", + "cores": "Number of Allocated Processors", + "memory": "Used Memory", + "scheduletime": "Submit Time" +}): + header = { + "Job Number": 0, + "Submit Time": 1, + "Wait Time": 2, + "Run Time": 3, + "Number of Allocated Processors": 4, + "Average CPU Time Used": 5, + "Used Memory": 6, + "Requested Number of Processors": 7, + "Requested Time": 8, + "Requested Memory": 9, + "Status": 10, + "User ID": 11, + "Group ID": 12, + "Executable (Application) Number": 13, + "Queue Number": 14, + "Partition Number": 15, + "Preceding Job Number": 16, + "Think Time from Preceding Job": 17 + } + reader = csv.reader((line for line in iterable if line[0] != ';'), delimiter=' ', skipinitialspace=True) + for row in reader: + yield Job( + env, + walltime=row[header[resource_name_mapping["walltime"]]], + resources={ + "cores": row[header[resource_name_mapping["cores"]]], + "memory": row[header[resource_name_mapping["memory"]]] + }, + used_resources={ + "cores": row[header[used_resource_name_mapping["cores"]]], + "memory": 
row[header[used_resource_name_mapping["memory"]]] + }, schedule_date=float(row[header[used_resource_name_mapping["scheduletime"]]])) + From f1fd815214d6a0f2615d46161e20938a11e54582 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 07:12:04 +0200 Subject: [PATCH 058/648] import for standard workload format now excludes negative values, fixes issue #11 --- cobald_sim/job_io/swf.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cobald_sim/job_io/swf.py b/cobald_sim/job_io/swf.py index 4881144..350f7b5 100644 --- a/cobald_sim/job_io/swf.py +++ b/cobald_sim/job_io/swf.py @@ -39,11 +39,12 @@ def swf_job_reader(env, iterable, resource_name_mapping={ env, walltime=row[header[resource_name_mapping["walltime"]]], resources={ - "cores": row[header[resource_name_mapping["cores"]]], - "memory": row[header[resource_name_mapping["memory"]]] + key: float(row[header[resource_name_mapping[key]]]) + for key in ("cores", "memory") + if float(row[header[resource_name_mapping[key]]]) >= 0 }, used_resources={ - "cores": row[header[used_resource_name_mapping["cores"]]], - "memory": row[header[used_resource_name_mapping["memory"]]] + key: float(row[header[used_resource_name_mapping[key]]]) + for key in ("cores", "memory") + if float(row[header[used_resource_name_mapping[key]]]) >= 0 }, schedule_date=float(row[header[used_resource_name_mapping["scheduletime"]]])) - From ddd64ee67964d7afc59f9d9ff6e3450dc3ec1b4d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 07:16:45 +0200 Subject: [PATCH 059/648] renamed scheduletime to queuetime, fixes issue #15 --- cobald_sim/job.py | 8 ++++---- cobald_sim/job_io/htcondor.py | 4 ++-- cobald_sim/job_io/swf.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cobald_sim/job.py b/cobald_sim/job.py index 08e79e0..52e34e8 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -34,12 +34,12 @@ class Job(object): - def __init__(self, env, walltime, resources, used_resources=None, in_queue_since=0, schedule_date=0): + def __init__(self, env, walltime, resources, used_resources=None, in_queue_since=0, queue_date=0): self.env = env self.resources = resources self.used_resources = used_resources self.walltime = float(walltime) - self.schedule_date = schedule_date + self.queue_date = queue_date self.in_queue_since = in_queue_since self.in_queue_until = None self.processing = None @@ -77,14 +77,14 @@ def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): with open(filename, "r") as input_file: reader = htcondor_job_reader(env, input_file) job = next(reader) - base_date = job.schedule_date + base_date = job.queue_date current_time = 0 count = 0 while True: if not job: job = next(reader) - current_time = job.schedule_date - base_date + current_time = job.queue_date - base_date if env.now >= current_time: count += 1 job.in_queue_since = env.now diff --git a/cobald_sim/job_io/htcondor.py b/cobald_sim/job_io/htcondor.py index 3016c46..4d31803 100644 --- a/cobald_sim/job_io/htcondor.py +++ b/cobald_sim/job_io/htcondor.py @@ -9,7 +9,7 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ "memory": "RequestMemory", "disk": "RequestDisk" }, used_resource_name_mapping={ - "scheduletime": "QDate", + "queuetime": "QDate", "walltime": "RemoteWallClockTime", "cores": "Number of Allocated Processors", "memory": "MemoryUsage", "disk": "DiskUsage_RAW" }): @@ -30,4 +30,4 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ float(row[used_resource_name_mapping["walltime"]]),
"memory": float(row[used_resource_name_mapping["memory"]]), # "disk": float(row[used_resource_name_mapping["disk"]]) - }, schedule_date=float(row[used_resource_name_mapping["scheduletime"]])) + }, queue_date=float(row[used_resource_name_mapping["queuetime"]])) diff --git a/cobald_sim/job_io/swf.py b/cobald_sim/job_io/swf.py index 350f7b5..52b816a 100644 --- a/cobald_sim/job_io/swf.py +++ b/cobald_sim/job_io/swf.py @@ -11,7 +11,7 @@ def swf_job_reader(env, iterable, resource_name_mapping={ "walltime": "Run Time", "cores": "Number of Allocated Processors", "memory": "Used Memory", - "scheduletime": "Submit Time" + "queuetime": "Submit Time" }): header = { "Job Number": 0, @@ -47,4 +47,4 @@ def swf_job_reader(env, iterable, resource_name_mapping={ key: float(row[header[used_resource_name_mapping[key]]]) for key in ("cores", "memory") if float(row[header[used_resource_name_mapping[key]]]) >= 0 - }, schedule_date=float(row[header[used_resource_name_mapping["scheduletime"]]])) + }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]])) From b522cb728b9cf34bcab0858d91d12240f21421b6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 22:42:35 +0200 Subject: [PATCH 060/648] condor export generator now works on file and not file name --- cobald_sim/job.py | 41 ++++++++++++++++++++--------------------- main.py | 23 ++++++++++++----------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/cobald_sim/job.py b/cobald_sim/job.py index 52e34e8..dadf610 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -71,27 +71,26 @@ def job_property_generator(**kwargs): yield 10, {"memory": 8, "cores": 1, "disk": 100} -def htcondor_export_job_generator(filename, job_queue, env=None, **kwargs): +def htcondor_export_job_generator(input_file, job_queue, env=None, **kwargs): from .job_io.htcondor import htcondor_job_reader - with open(filename, "r") as input_file: - reader = htcondor_job_reader(env, input_file) - job = next(reader) - base_date = job.queue_date - current_time = 0 + reader = htcondor_job_reader(env, input_file) + job = next(reader) + base_date = job.queue_date + current_time = 0 - count = 0 - while True: - if not job: - job = next(reader) - current_time = job.queue_date - base_date - if env.now >= current_time: - count += 1 - job.in_queue_since = env.now - job_queue.append(job) - job = None - else: - if count > 0: - logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": count}) - count = 0 - yield env.timeout(1) + count = 0 + while True: + if not job: + job = next(reader) + current_time = job.queue_date - base_date + if env.now >= current_time: + count += 1 + job.in_queue_since = env.now + job_queue.append(job) + job = None + else: + if count > 0: + logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": count}) + count = 0 + yield env.timeout(1) diff --git a/main.py b/main.py index 58bb26f..e751749 100644 --- a/main.py +++ b/main.py @@ -207,17 +207,18 @@ def main(filename="condor_usage_sorted_filtered.csv", until=2000): random.seed(1234) env = simpy.Environment() trace(env, monitor_data, resource_normalisation=resource_normalisation) - globals.job_generator = htcondor_export_job_generator(filename=filename, - job_queue=globals.job_queue, - env=env) - env.process(globals.job_generator) - for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: - pool = Pool(env, resources=resources) - globals.pools.append(pool) - SimulatedCostController(env, target=pool, rate=1) - 
globals.pools.append(StaticPool(env, init=2)) - globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) - env.run(until=until) + with open(filename, "r") as input_file: + globals.job_generator = htcondor_export_job_generator(input_file=input_file, + job_queue=globals.job_queue, + env=env) + env.process(globals.job_generator) + for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: + pool = Pool(env, resources=resources) + globals.pools.append(pool) + SimulatedCostController(env, target=pool, rate=1) + globals.pools.append(StaticPool(env, init=2)) + globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) + env.run(until=until) if __name__ == "__main__": From 145244387e58da0b1a5c0c3c6b603e052a57c6c3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 22:43:15 +0200 Subject: [PATCH 061/648] resources currently only consider cores and memory --- cobald_sim/job_io/htcondor.py | 2 +- cobald_sim/pool_io/htcondor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cobald_sim/job_io/htcondor.py b/cobald_sim/job_io/htcondor.py index 4d31803..07c6129 100644 --- a/cobald_sim/job_io/htcondor.py +++ b/cobald_sim/job_io/htcondor.py @@ -22,7 +22,7 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ env, walltime=row[resource_name_mapping["walltime"]], resources={ - "cores": int(row[resource_name_mapping["cores"]]), + "cores": float(row[resource_name_mapping["cores"]]), # "disk": float(row[resource_name_mapping["disk"]]), "memory": float(row[resource_name_mapping["memory"]]) }, used_resources={ diff --git a/cobald_sim/pool_io/htcondor.py b/cobald_sim/pool_io/htcondor.py index db1125b..26fcfde 100644 --- a/cobald_sim/pool_io/htcondor.py +++ b/cobald_sim/pool_io/htcondor.py @@ -20,4 +20,4 @@ def htcondor_pool_reader(env, iterable, resource_name_mapping={ yield StaticPool( env, init=int(row["Count"]), - resources={key: row[value] for key, value in resource_name_mapping.items()}) + resources={key: float(row[value]) for key, value in resource_name_mapping.items() if key in ["cores", "memory"]}) From c0ef764bdf1a1adae6af9085c29481f2eb32e97c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 22:44:57 +0200 Subject: [PATCH 062/648] added functionality for static simulations via cli, related to issue #13 --- cobald_sim/cli/__init__.py | 0 cobald_sim/cli/simulate.py | 163 +++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 cobald_sim/cli/__init__.py create mode 100644 cobald_sim/cli/simulate.py diff --git a/cobald_sim/cli/__init__.py b/cobald_sim/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py new file mode 100644 index 0000000..e357061 --- /dev/null +++ b/cobald_sim/cli/simulate.py @@ -0,0 +1,163 @@ +import random +from functools import partial, wraps + +import click +import simpy +import logging.handlers + +from cobald.monitor.format_json import JsonFormatter + +from cobald_sim.cost import cobald_cost +from cobald_sim.job import htcondor_export_job_generator +from cobald_sim.job_io.htcondor import htcondor_job_reader +from cobald_sim.pool_io.htcondor import htcondor_pool_reader +from cobald_sim.job_io.swf import swf_job_reader + +from cobald_sim import globals +from cobald_sim.scheduler import CondorJobScheduler + + +class JSONSocketHandler(logging.handlers.SocketHandler): + def makePickle(self, record): + return 
self.format(record).encode() + + +monitoring_logger = logging.getLogger("general") +monitoring_logger.setLevel(logging.DEBUG) +socketHandler = JSONSocketHandler( + 'localhost', + logging.handlers.DEFAULT_TCP_LOGGING_PORT) +streamHandler = logging.StreamHandler() +socketHandler.setFormatter(JsonFormatter()) +streamHandler.setFormatter(JsonFormatter()) +monitoring_logger.addHandler(socketHandler) +monitoring_logger.addHandler(streamHandler) + +last_step = 0 + +job_import_mapper = { + "htcondor": htcondor_job_reader, + "swf": swf_job_reader +} + +pool_import_mapper = { + "htcondor": htcondor_pool_reader +} + + +def trace(env, callback, resource_normalisation): + def get_wrapper(env_step, callback): + @wraps(env_step) + def tracing_step(): + if len(env._queue): + t, prio, eid, event = env._queue[0] + callback(t, prio, eid, event, resource_normalisation) + return env_step() + return tracing_step + env.step = get_wrapper(env.step, callback) + + +def monitor(data, t, prio, eid, event, resource_normalisation): + if event.value: + if isinstance(event.value, simpy.exceptions.Interrupt): + job = event.value.cause + for resource_key, usage in job.used_resources.items(): + value = usage / job.resources[resource_key] + if value > 1: + monitoring_logger.info(str(round(t)), {"job_exceeds_%s" % resource_key: value}) + if isinstance(event.value, Job): + monitoring_logger.info(str(round(t)), {"job_waiting_times": event.value.waiting_time}) + global last_step + if t > last_step: + # new data to be recorded + tmp = round(t) + last_step = tmp + pool_demand = 0 + pool_supply = 0 + pool_utilisation = 0 + pool_allocation = 0 + running_jobs = 0 + used_resources = 0 + unused_resources = 0 + available_resources = 0 + empty_drones = 0 + result = {} + for pool in globals.pools: + pool_demand += pool.demand + pool_supply += pool.supply + result["pool_%s_supply" % id(pool)] = pool.supply + pool_utilisation += pool.utilisation + pool_allocation += pool.allocation + for drone in pool.drones: + running_jobs += drone.jobs + if drone.allocation == 0: + empty_drones += 1 + for resource_key, usage in drone.resources.items(): + normalisation_factor = resource_normalisation.get(resource_key, 1) + used_resources += usage / normalisation_factor + unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor + available_resources += pool.resources[resource_key] / normalisation_factor + result["user_demand"] = len(globals.job_queue) + result["pool_demand"] = pool_demand + result["pool_supply"] = pool_supply + result["pool_utilisation"] = pool_utilisation + result["pool_allocation"] = pool_allocation + result["running_jobs"] = running_jobs + result["empty_drones"] = empty_drones + result["used_resources"] = used_resources + result["unused_resources"] = unused_resources + result["available_resources"] = available_resources + current_cost = cobald_cost() + result["cost"] = current_cost + globals.cost += current_cost + result["acc_cost"] = globals.cost + monitoring_logger.info(str(tmp), result) + + +@click.group() +@click.option("--seed", type=int, default=1234) +@click.pass_context +def cli(ctx, seed): + ctx.ensure_object(dict) + ctx.obj['seed'] = seed + + +@cli.command() +@click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.pass_context +def static(ctx, job_file, pool_file, until=2000): + click.echo("starting static environment") + 
random.seed(ctx.obj["seed"]) + resource_normalisation = {"memory": 2000} + monitor_data = partial(monitor, resource_normalisation) + + env = simpy.Environment() + trace(env, monitor_data, resource_normalisation=resource_normalisation) + file, file_type = job_file + globals.job_generator = htcondor_export_job_generator(input_file=file, + job_queue=globals.job_queue, + env=env) + for current_pool in pool_file: + file, file_type = current_pool + for pool in pool_import_mapper[file_type](env=env, iterable=file): + globals.pools.append(pool) + env.process(globals.job_generator) + globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) + env.run(until=until) + + +@cli.command() +@click.pass_context +def dynamic(ctx): + click.echo("starting dynamic environment") + + +@cli.command() +@click.pass_context +def hybrid(ctx): + click.echo("starting hybrid environment") + + +if __name__ == '__main__': + cli() From badb5c30764dd0875342911e5e668e1e696c5c1e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 22:55:43 +0200 Subject: [PATCH 063/648] renamed htcondor_export_job_generator and removed its dependency on htcondor reader, fixes issue #16 --- cobald_sim/cli/simulate.py | 8 ++++---- cobald_sim/job.py | 9 +++------ main.py | 9 +++++---- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index e357061..cadb8e9 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -8,7 +8,7 @@ from cobald.monitor.format_json import JsonFormatter from cobald_sim.cost import cobald_cost -from cobald_sim.job import htcondor_export_job_generator +from cobald_sim.job import job_to_queue_scheduler from cobald_sim.job_io.htcondor import htcondor_job_reader from cobald_sim.pool_io.htcondor import htcondor_pool_reader from cobald_sim.job_io.swf import swf_job_reader @@ -135,9 +135,9 @@ def static(ctx, job_file, pool_file, until=2000): env = simpy.Environment() trace(env, monitor_data, resource_normalisation=resource_normalisation) file, file_type = job_file - globals.job_generator = htcondor_export_job_generator(input_file=file, - job_queue=globals.job_queue, - env=env) + globals.job_generator = job_to_queue_scheduler(job_generator=job_import_mapper[file_type](env, file), + job_queue=globals.job_queue, + env=env) for current_pool in pool_file: + file, file_type = current_pool + for pool in pool_import_mapper[file_type](env=env, iterable=file): diff --git a/cobald_sim/job.py b/cobald_sim/job.py index dadf610..181dd27 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -71,18 +71,15 @@ def job_property_generator(**kwargs): yield 10, {"memory": 8, "cores": 1, "disk": 100} -def htcondor_export_job_generator(input_file, job_queue, env=None, **kwargs): - from .job_io.htcondor import htcondor_job_reader - - reader = htcondor_job_reader(env, input_file) - job = next(reader) +def job_to_queue_scheduler(job_generator, job_queue, env=None, **kwargs): + job = next(job_generator) base_date = job.queue_date current_time = 0 count = 0 while True: if not job: - job = next(reader) + job = next(job_generator) current_time = job.queue_date - base_date if env.now >= current_time: count += 1 job.in_queue_since = env.now job_queue.append(job) job = None else: if count > 0: logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": count}) count = 0 yield env.timeout(1) diff --git a/main.py b/main.py index e751749..8cdbe39 100644 --- a/main.py +++ b/main.py @@ -10,10 +10,11 @@ from cobald_sim import globals from cobald_sim.cost import cobald_cost -from cobald_sim.job import htcondor_export_job_generator, Job +from cobald_sim.job import job_to_queue_scheduler, Job from cobald_sim.scheduler import
CondorJobScheduler from cobald_sim.pool import Pool, StaticPool from cobald_sim.controller import SimulatedCostController +from cobald_sim.job_io.htcondor import htcondor_job_reader class JSONSocketHandler(logging.handlers.SocketHandler): @@ -208,9 +209,9 @@ def main(filename="condor_usage_sorted_filtered.csv", until=2000): env = simpy.Environment() trace(env, monitor_data, resource_normalisation=resource_normalisation) with open(filename, "r") as input_file: - globals.job_generator = htcondor_export_job_generator(input_file=input_file, - job_queue=globals.job_queue, - env=env) + globals.job_generator = job_to_queue_scheduler(job_generator=htcondor_job_reader(env, input_file), + job_queue=globals.job_queue, + env=env) env.process(globals.job_generator) for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: pool = Pool(env, resources=resources) From ca9c1bcf155f4f450c1a97be1d83770dea90cf5a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 24 Oct 2018 22:59:39 +0200 Subject: [PATCH 064/648] included until into parameters, related to issue #13 --- cobald_sim/cli/simulate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index cadb8e9..014e52c 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -125,8 +125,9 @@ def cli(ctx, seed): @cli.command() @click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) @click.option("--pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--until", default=2000) @click.pass_context -def static(ctx, job_file, pool_file, until=2000): +def static(ctx, job_file, pool_file, until): click.echo("starting static environment") random.seed(ctx.obj["seed"]) resource_normalisation = {"memory": 2000} From 2153ace33417ad89181be4fa56b744137bce53bd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 25 Oct 2018 20:07:05 +0200 Subject: [PATCH 065/648] htcondor pool importer now can yield different types of pools, fixes issue #17 --- cobald_sim/cli/simulate.py | 3 ++- cobald_sim/pool.py | 14 +++++++------- cobald_sim/pool_io/htcondor.py | 9 +++++---- main.py | 2 +- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index 014e52c..8d0d5e1 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -10,6 +10,7 @@ from cobald_sim.cost import cobald_cost from cobald_sim.job import job_to_queue_scheduler from cobald_sim.job_io.htcondor import htcondor_job_reader +from cobald_sim.pool import StaticPool from cobald_sim.pool_io.htcondor import htcondor_pool_reader from cobald_sim.job_io.swf import swf_job_reader @@ -141,7 +142,7 @@ def static(ctx, job_file, pool_file, until): env=env) for current_pool in pool_file: file, file_type = current_pool - for pool in pool_import_mapper[file_type](env=env, iterable=file): + for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=StaticPool): globals.pools.append(pool) env.process(globals.job_generator) globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) diff --git a/cobald_sim/pool.py b/cobald_sim/pool.py index ad83df0..4afa3f7 100644 --- a/cobald_sim/pool.py +++ b/cobald_sim/pool.py @@ -5,7 +5,7 @@ class Pool(interfaces.Pool, container.Container): - def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8, "cores": 1}): + def 
__init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000, "cores": 1}): super(Pool, self).__init__(env, capacity, init) self.resources = resources self._demand = 1 @@ -83,13 +83,13 @@ def demand(self, value): class StaticPool(Pool): - def __init__(self, env, init=0, resources={"memory": 8, "cores": 1}): - assert init > 0, "Static pool was initialised without any resources..." - super(StaticPool, self).__init__(env, capacity=init, init=init, resources=resources) - self._demand = init - for _ in range(init): + def __init__(self, env, capacity=0, resources={"memory": 8000, "cores": 1}): + assert capacity > 0, "Static pool was initialised without any resources..." + super(StaticPool, self).__init__(env, capacity=capacity, init=capacity, resources=resources) + self._demand = capacity + for _ in range(capacity): self._drones.append(Drone(self.env, self.resources, 0)) - self.put(init) + self.put(capacity) def run(self): while True: diff --git a/cobald_sim/pool_io/htcondor.py b/cobald_sim/pool_io/htcondor.py index 26fcfde..e443294 100644 --- a/cobald_sim/pool_io/htcondor.py +++ b/cobald_sim/pool_io/htcondor.py @@ -1,23 +1,24 @@ import csv -from ..pool import StaticPool +from ..pool import Pool def htcondor_pool_reader(env, iterable, resource_name_mapping={ "cores": "TotalSlotCPUs", "disk": "TotalSlotDisk", "memory": "TotalSlotMemory" -}): +}, pool_type=Pool): """ Load a pool configuration that was exported via htcondor from files or iterables :param iterable: an iterable yielding lines of CSV, such as an open file :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation + :param pool_type: The type of pool to be yielded :return: Yields the :py:class:`StaticPool`s found in the given iterable """ reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) for row_idx, row in enumerate(reader): - yield StaticPool( + yield pool_type( env, - init=int(row["Count"]), + capacity=int(row["Count"]), resources={key: float(row[value]) for key, value in resource_name_mapping.items() if key in ["cores", "memory"]}) diff --git a/main.py b/main.py index 8cdbe39..1ccd973 100644 --- a/main.py +++ b/main.py @@ -217,7 +217,7 @@ def main(filename="condor_usage_sorted_filtered.csv", until=2000): pool = Pool(env, resources=resources) globals.pools.append(pool) SimulatedCostController(env, target=pool, rate=1) - globals.pools.append(StaticPool(env, init=2)) + globals.pools.append(StaticPool(env, capacity=2)) globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) env.run(until=until) From 44e523ca1db2074df445d60ad40299967947bb7a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 25 Oct 2018 20:12:48 +0200 Subject: [PATCH 066/648] added basic implementation for dynamic simulation, related to issue #13 --- cobald_sim/cli/simulate.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index 8d0d5e1..07cd722 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -7,10 +7,11 @@ from cobald.monitor.format_json import JsonFormatter +from cobald_sim.controller import SimulatedCostController from cobald_sim.cost import cobald_cost from cobald_sim.job import job_to_queue_scheduler from cobald_sim.job_io.htcondor import htcondor_job_reader -from cobald_sim.pool import StaticPool +from cobald_sim.pool import StaticPool, Pool from cobald_sim.pool_io.htcondor import htcondor_pool_reader from 
cobald_sim.job_io.swf import swf_job_reader @@ -117,18 +118,19 @@ def monitor(data, t, prio, eid, event, resource_normalisation): @click.group() @click.option("--seed", type=int, default=1234) +@click.option("--until", default=2000) @click.pass_context -def cli(ctx, seed): +def cli(ctx, seed, until): ctx.ensure_object(dict) ctx.obj['seed'] = seed + ctx.obj['until'] = until @cli.command() @click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) @click.option("--pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) -@click.option("--until", default=2000) @click.pass_context -def static(ctx, job_file, pool_file, until): +def static(ctx, job_file, pool_file): click.echo("starting static environment") random.seed(ctx.obj["seed"]) resource_normalisation = {"memory": 2000} @@ -146,13 +148,33 @@ def static(ctx, job_file, pool_file, until): globals.pools.append(pool) env.process(globals.job_generator) globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) - env.run(until=until) + env.run(until=ctx.obj["until"]) @cli.command() +@click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context -def dynamic(ctx): +def dynamic(ctx, job_file, pool_file): click.echo("starting dynamic environment") + random.seed(ctx.obj["seed"]) + resource_normalisation = {"memory": 2000} + monitor_data = partial(monitor, resource_normalisation) + + env = simpy.Environment() + trace(env, monitor_data, resource_normalisation=resource_normalisation) + file, file_type = job_file + globals.job_generator = job_to_queue_scheduler(job_generator=job_import_mapper[file_type](env, file), + job_queue=globals.job_queue, + env=env) + for current_pool in pool_file: + file, file_type = current_pool + for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=Pool): + globals.pools.append(pool) + SimulatedCostController(env, target=pool, rate=1) + env.process(globals.job_generator) + globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) + env.run(until=ctx.obj["until"]) @cli.command() From a753766d5a50327b0e771f8a841928656daeb4c9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 25 Oct 2018 21:49:37 +0200 Subject: [PATCH 067/648] provided basic implementation for hybrid simulation cli, fixes issue #13 --- cobald_sim/cli/simulate.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index 07cd722..f3b3961 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -178,9 +178,34 @@ def dynamic(ctx, job_file, pool_file): @cli.command() +@click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--static_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--dynamic_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context -def hybrid(ctx): +def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): click.echo("starting hybrid environment") + random.seed(ctx.obj["seed"]) + resource_normalisation = {"memory": 2000} + monitor_data = partial(monitor, resource_normalisation) + + env = simpy.Environment() + trace(env, monitor_data, 
resource_normalisation=resource_normalisation) + file, file_type = job_file + globals.job_generator = job_to_queue_scheduler(job_generator=job_import_mapper[file_type](env, file), + job_queue=globals.job_queue, + env=env) + for current_pool in static_pool_file: + file, file_type = current_pool + for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=StaticPool): + globals.pools.append(pool) + for current_pool in dynamic_pool_file: + file, file_type = current_pool + for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=Pool): + globals.pools.append(pool) + SimulatedCostController(env, target=pool, rate=1) + env.process(globals.job_generator) + globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) + env.run(until=ctx.obj["until"]) if __name__ == '__main__': From a856279d7ab86c996d3874ae15b56e708337d525 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 25 Oct 2018 22:04:37 +0200 Subject: [PATCH 068/648] walltime of jobs is now initialised by resource dicts, solves issue #14 --- cobald_sim/job.py | 8 +++++--- cobald_sim/job_io/htcondor.py | 9 ++++----- cobald_sim/job_io/swf.py | 5 ++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cobald_sim/job.py b/cobald_sim/job.py index 181dd27..a0ba397 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -34,11 +34,13 @@ def job_demand(env): class Job(object): - def __init__(self, env, walltime, resources, used_resources=None, in_queue_since=0, queue_date=0): + def __init__(self, env, resources, used_resources=None, in_queue_since=0, queue_date=0): self.env = env self.resources = resources self.used_resources = used_resources - self.walltime = float(walltime) + self.walltime = used_resources.pop("walltime", None) + self.requested_walltime = resources.pop("walltime", None) + assert self.walltime or self.requested_walltime, "Job does not provide any walltime" self.queue_date = queue_date self.in_queue_since = in_queue_since self.in_queue_until = None @@ -57,7 +59,7 @@ def process(self): def _process(self): try: - yield self.env.timeout(self.walltime, value=self) + yield self.env.timeout(self.requested_walltime or self.walltime, value=self) except simpy.exceptions.Interrupt: pass diff --git a/cobald_sim/job_io/htcondor.py b/cobald_sim/job_io/htcondor.py index 07c6129..2da7734 100644 --- a/cobald_sim/job_io/htcondor.py +++ b/cobald_sim/job_io/htcondor.py @@ -20,14 +20,13 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ for row in htcondor_reader: yield Job( env, - walltime=row[resource_name_mapping["walltime"]], resources={ - "cores": float(row[resource_name_mapping["cores"]]), - # "disk": float(row[resource_name_mapping["disk"]]), - "memory": float(row[resource_name_mapping["memory"]]) - }, used_resources={ + key: float(row[resource_name_mapping[key]]) for key in resource_name_mapping + if key in ["cores", "memory", "walltime"]}, + used_resources={ "cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) / float(row[used_resource_name_mapping["walltime"]]), "memory": float(row[used_resource_name_mapping["memory"]]), + "walltime": float(row[used_resource_name_mapping["walltime"]]), # "disk": float(row[used_resource_name_mapping["disk"]]) }, queue_date=float(row[used_resource_name_mapping["queuetime"]])) diff --git a/cobald_sim/job_io/swf.py b/cobald_sim/job_io/swf.py index 52b816a..676a35a 100644 --- a/cobald_sim/job_io/swf.py +++ b/cobald_sim/job_io/swf.py @@ -37,14 +37,13 @@ def swf_job_reader(env, iterable, resource_name_mapping={ 
for row in reader: yield Job( env, - walltime=row[header[resource_name_mapping["walltime"]]], resources={ key: float(row[header[resource_name_mapping[key]]]) - for key in ("cores", "memory") + for key in ("cores", "memory", "walltime") if float(row[header[resource_name_mapping[key]]]) >= 0 }, used_resources={ key: float(row[header[used_resource_name_mapping[key]]]) - for key in ("cores", "memory") + for key in ("cores", "memory", "walltime") if float(row[header[used_resource_name_mapping[key]]]) >= 0 }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]])) From 01ce5c706f97b65234616097dbfa7a1573850cdb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 26 Oct 2018 20:46:13 +0200 Subject: [PATCH 069/648] now also disk is correctly handled as resource including management of missing resources in requested vs used resources, closes issue #19 --- cobald_sim/drone.py | 32 ++++++++++++++++++++++++-------- cobald_sim/job_io/htcondor.py | 5 ++--- cobald_sim/job_io/swf.py | 2 +- cobald_sim/pool_io/htcondor.py | 2 +- cobald_sim/scheduler.py | 2 ++ 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/cobald_sim/drone.py b/cobald_sim/drone.py index 12a99f9..7902c40 100644 --- a/cobald_sim/drone.py +++ b/cobald_sim/drone.py @@ -69,22 +69,38 @@ def start_job(self, job, kill=False): self.jobs += 1 job_execution = job.process() for resource_key in job.resources: - if self.used_resources[resource_key] + job.used_resources[resource_key] > self.pool_resources[resource_key]: - job.kill() - if job.resources[resource_key] < job.used_resources[resource_key]: - if kill: + try: + if self.used_resources[resource_key] + job.used_resources[resource_key] > self.pool_resources[resource_key]: job.kill() - else: - pass + except KeyError: + # we do not have data about how many resources the job used, so check with requested data + if self.used_resources[resource_key] + job.resources[resource_key] > self.pool_resources[resource_key]: + job.kill() + try: + if job.resources[resource_key] < job.used_resources[resource_key]: + if kill: + job.kill() + else: + pass + except KeyError: + # check is not relevant if the data is not stored + pass for resource_key in job.resources: self.resources[resource_key] += job.resources[resource_key] - self.used_resources[resource_key] += job.used_resources[resource_key] + try: + self.used_resources[resource_key] += job.used_resources[resource_key] + except KeyError: + self.used_resources[resource_key] += job.resources[resource_key] yield job_execution self.jobs -= 1 self._utilisation = None self._allocation = None for resource_key in job.resources: self.resources[resource_key] -= job.resources[resource_key] - self.used_resources[resource_key] -= job.used_resources[resource_key] + for resource_key in {*job.resources, *job.used_resources}: + try: + self.used_resources[resource_key] -= job.used_resources[resource_key] + except KeyError: + self.used_resources[resource_key] -= job.resources[resource_key] # put drone back into pool queue # print("[drone %s] finished job at %d" % (self, self.env.now)) diff --git a/cobald_sim/job_io/htcondor.py b/cobald_sim/job_io/htcondor.py index 2da7734..35e4bee 100644 --- a/cobald_sim/job_io/htcondor.py +++ b/cobald_sim/job_io/htcondor.py @@ -22,11 +22,10 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ env, resources={ key: float(row[resource_name_mapping[key]]) for key in resource_name_mapping - if key in ["cores", "memory", "walltime"]}, - used_resources={ + }, used_resources={ "cores": 
(float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) / float(row[used_resource_name_mapping["walltime"]]), "memory": float(row[used_resource_name_mapping["memory"]]), "walltime": float(row[used_resource_name_mapping["walltime"]]), - # "disk": float(row[used_resource_name_mapping["disk"]]) + "disk": float(row[used_resource_name_mapping["disk"]]) }, queue_date=float(row[used_resource_name_mapping["queuetime"]])) diff --git a/cobald_sim/job_io/swf.py b/cobald_sim/job_io/swf.py index 676a35a..babf59c 100644 --- a/cobald_sim/job_io/swf.py +++ b/cobald_sim/job_io/swf.py @@ -44,6 +44,6 @@ def swf_job_reader(env, iterable, resource_name_mapping={ }, used_resources={ key: float(row[header[used_resource_name_mapping[key]]]) - for key in ("cores", "memory", "walltime") + for key in used_resource_name_mapping.keys() if float(row[header[used_resource_name_mapping[key]]]) >= 0 }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]])) diff --git a/cobald_sim/pool_io/htcondor.py b/cobald_sim/pool_io/htcondor.py index e443294..79696a3 100644 --- a/cobald_sim/pool_io/htcondor.py +++ b/cobald_sim/pool_io/htcondor.py @@ -21,4 +21,4 @@ def htcondor_pool_reader(env, iterable, resource_name_mapping={ yield pool_type( env, capacity=int(row["Count"]), - resources={key: float(row[value]) for key, value in resource_name_mapping.items() if key in ["cores", "memory"]}) + resources={key: float(row[value]) for key, value in resource_name_mapping.items()}) diff --git a/cobald_sim/scheduler.py b/cobald_sim/scheduler.py index 8dccb9c..f30bdad 100644 --- a/cobald_sim/scheduler.py +++ b/cobald_sim/scheduler.py @@ -51,6 +51,8 @@ def _schedule_job(self, job): for resource_type in resource_types: if resource_type not in drone.resources.keys(): cost = float("Inf") + elif resource_type not in job.resources: + cost += drone.resources[resource_type] - drone.resources[resource_type] elif (pool.resources[resource_type] - drone.resources[resource_type]) < \ job.resources[resource_type]: cost = float("Inf") From e3e411e6be9f946de03d508fb91493bca6fc8253 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 27 Oct 2018 21:31:35 +0200 Subject: [PATCH 070/648] removed reference to globals --- cobald_sim/cost.py | 9 +++------ cobald_sim/globals.py | 5 ----- cobald_sim/job.py | 8 ++++---- 3 files changed, 7 insertions(+), 15 deletions(-) delete mode 100644 cobald_sim/globals.py diff --git a/cobald_sim/cost.py b/cobald_sim/cost.py index 69059d0..72b5dd5 100644 --- a/cobald_sim/cost.py +++ b/cobald_sim/cost.py @@ -1,9 +1,6 @@ -from cobald_sim import globals - - -def cobald_cost(): - result = len(globals.job_queue) - for pool in globals.pools: +def cobald_cost(simulator): + result = len(simulator.job_queue) + for pool in simulator.pools: for drone in pool.drones: result += 1 tmp = 0 diff --git a/cobald_sim/globals.py b/cobald_sim/globals.py deleted file mode 100644 index e1ac98c..0000000 --- a/cobald_sim/globals.py +++ /dev/null @@ -1,5 +0,0 @@ -pools = [] -job_queue = [] -job_generator = None -job_scheduler = None -cost = 0 diff --git a/cobald_sim/job.py b/cobald_sim/job.py index a0ba397..c97d068 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -4,7 +4,7 @@ import logging -def job_demand(env): +def job_demand(simulator): """ function randomly sets global user demand by using different strategies :param env: @@ -25,11 +25,11 @@ def job_demand(env): # sqrt # print("strategy: sqrt amount") amount = math.sqrt(random.random()*random.random()*100) - value = yield env.timeout(delay=delay, value=amount) + value = yield 
simulator.env.timeout(delay=delay, value=amount) value = round(value) if value > 0: - globals.global_demand.put(value) - logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": value}) + simulator.global_demand.put(value) + logging.getLogger("general").info(str(round(simulator.env.now)), {"user_demand_new": value}) # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) From 1f3483499eed609a8de26f3193104abb0f3396eb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 27 Oct 2018 21:32:58 +0200 Subject: [PATCH 071/648] added specific simulator to start actual simulation --- cobald_sim/scheduler.py | 20 ++++++++--------- cobald_sim/simulator.py | 39 ++++++++++++++++++++++++++++++++ main.py | 50 ++++++++++++++++++++++++----------------- 3 files changed, 78 insertions(+), 31 deletions(-) create mode 100644 cobald_sim/simulator.py diff --git a/cobald_sim/scheduler.py b/cobald_sim/scheduler.py index f30bdad..863845a 100644 --- a/cobald_sim/scheduler.py +++ b/cobald_sim/scheduler.py @@ -1,15 +1,12 @@ -from cobald_sim import globals - - # TODO: does not work anymore as there is no method get_drone at pool -def job_scheduler(env): +def job_scheduler(simulator): while True: - for pool in globals.pools: - while pool.level > 0 and globals.global_demand.level > 0: + for pool in simulator.pools: + while pool.level > 0 and simulator.global_demand.level > 0: drone = yield from pool.get_drone(1) - env.process(drone.start_job(*next(globals.job_generator))) - yield env.timeout(0) - yield env.timeout(1) + simulator.env.process(drone.start_job(*next(simulator.job_generator))) + yield simulator.env.timeout(0) + yield simulator.env.timeout(1) class CondorJobScheduler(object): @@ -25,9 +22,10 @@ class CondorJobScheduler(object): :param env: :return: """ - def __init__(self, env, job_queue): + def __init__(self, env, job_queue, pools): self.env = env self.job_queue = job_queue + self.pools = pools self.action = env.process(self.run()) def run(self): @@ -44,7 +42,7 @@ def run(self): def _schedule_job(self, job): priorities = {} - for pool in globals.pools: + for pool in self.pools: for drone in pool.drones: cost = 0 resource_types = {*drone.resources.keys(), *job.resources.keys()} diff --git a/cobald_sim/simulator.py b/cobald_sim/simulator.py new file mode 100644 index 0000000..313bac1 --- /dev/null +++ b/cobald_sim/simulator.py @@ -0,0 +1,39 @@ +import random +from functools import partial + +import simpy + +from cobald_sim.job import job_to_queue_scheduler +from cobald_sim.utility.monitor import monitor, trace + + +class Simulator(object): + def __init__(self, seed=1234): + random.seed(seed) + resource_normalisation = {"memory": 2000} + monitor_data = partial(monitor, resource_normalisation) + self.env = simpy.Environment() + self.job_queue = [] + self.pools = [] + self.job_input = None + self.job_scheduler = None + self.cost = 0 + trace(self.env, monitor_data, resource_normalisation=resource_normalisation, simulator=self) + + def create_job_generator(self, filename, job_reader): + self.job_input = open(filename, "r") + job_generator = job_to_queue_scheduler(job_generator=job_reader(self.env, self.job_input), + job_queue=self.job_queue, + env=self.env) + self.env.process(job_generator) + + def create_pools(self, filename, pool_reader, pool_type): + with open(filename, "r") as pool_input: + for pool in pool_reader(env=self.env, iterable=pool_input, pool_type=pool_type): + self.pools.append(pool) + + def create_scheduler(self, scheduler_type): 
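As an aside to the scheduler changes above: ``CondorJobScheduler._schedule_job`` walks every drone of every pool, assigns it a cost, and treats drones that cannot accommodate the job as infinitely expensive. The standalone sketch below illustrates that ranking idea with plain dictionaries; the function name ``rank_drones`` and the cost measure (leftover capacity) are illustrative assumptions and not part of the code base.

.. code:: python

    def rank_drones(job_resources, drones):
        """Order usable drones by cost; drones that cannot fit the job are dropped."""
        ranked = []
        for drone in drones:
            cost = 0
            for resource in set(job_resources) | set(drone):
                if resource not in drone:
                    # the drone does not offer this resource type at all
                    cost = float("inf")
                    break
                if drone[resource] < job_resources.get(resource, 0):
                    # not enough of this resource left on the drone
                    cost = float("inf")
                    break
                # prefer drones that leave little unused capacity behind
                cost += drone[resource] - job_resources.get(resource, 0)
            ranked.append((cost, drone))
        return [drone for cost, drone in sorted(ranked, key=lambda pair: pair[0])
                if cost < float("inf")]


    job = {"cores": 1, "memory": 2000}
    drones = [
        {"cores": 8, "memory": 24000},  # fits, but wastes a lot of capacity
        {"cores": 1, "memory": 2000},   # tight fit, ranked first
        {"cores": 1, "memory": 1000},   # too small, filtered out
    ]
    print(rank_drones(job, drones))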
+ self.job_scheduler = scheduler_type(env=self.env, job_queue=self.job_queue, pools=self.pools) + + def run(self, until=2000): + self.env.run(until=until) diff --git a/main.py b/main.py index 1ccd973..f53619e 100644 --- a/main.py +++ b/main.py @@ -7,14 +7,14 @@ import matplotlib.pyplot as plt -from cobald_sim import globals - from cobald_sim.cost import cobald_cost from cobald_sim.job import job_to_queue_scheduler, Job from cobald_sim.scheduler import CondorJobScheduler from cobald_sim.pool import Pool, StaticPool from cobald_sim.controller import SimulatedCostController from cobald_sim.job_io.htcondor import htcondor_job_reader +from cobald_sim.pool_io.htcondor import htcondor_pool_reader +from cobald_sim.simulator import Simulator class JSONSocketHandler(logging.handlers.SocketHandler): @@ -202,24 +202,34 @@ def generate_plots(): def main(filename="condor_usage_sorted_filtered.csv", until=2000): - resource_normalisation = {"memory": 2000} - monitor_data = partial(monitor, resource_normalisation) - - random.seed(1234) - env = simpy.Environment() - trace(env, monitor_data, resource_normalisation=resource_normalisation) - with open(filename, "r") as input_file: - globals.job_generator = job_to_queue_scheduler(job_generator=htcondor_job_reader(env, input_file), - job_queue=globals.job_queue, - env=env) - env.process(globals.job_generator) - for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: - pool = Pool(env, resources=resources) - globals.pools.append(pool) - SimulatedCostController(env, target=pool, rate=1) - globals.pools.append(StaticPool(env, capacity=2)) - globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) - env.run(until=until) + simulator = Simulator() + simulator.create_job_generator(filename=filename, job_reader=htcondor_job_reader) + simulator.create_pools(filename="ekp_worker.csv", pool_reader=htcondor_pool_reader, pool_type=StaticPool) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.run(until=until) + # resource_normalisation = {"memory": 2000} + # monitor_data = partial(monitor, resource_normalisation) + + # random.seed(1234) + # env = simpy.Environment() + # trace(env, monitor_data, resource_normalisation=resource_normalisation) + # with open(filename, "r") as input_file: + # globals.job_generator = job_to_queue_scheduler(job_generator=htcondor_job_reader(simulator.env, input_file), + # job_queue=globals.job_queue, + # env=simulator.env) + # simulator.env.process(globals.job_generator) + # with open("ekp_worker.csv", "r") as pool_input: + # for pool in htcondor_pool_reader(env=simulator.env, iterable=pool_input, pool_type=StaticPool): + # globals.pools.append(pool) + # # for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: + # # pool = Pool(env, resources=resources) + # # globals.pools.append(pool) + # # SimulatedCostController(env, target=pool, rate=1) + # # globals.pools.append(StaticPool(env, capacity=2)) + # globals.job_scheduler = CondorJobScheduler(env=simulator.env, job_queue=globals.job_queue) + # # env.run(until=until) + for job in simulator.job_scheduler.job_queue: + print(job.resources, job.used_resources) if __name__ == "__main__": From db54934aa7602e1061c53416a135fbf39be829ea Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 27 Oct 2018 21:35:14 +0200 Subject: [PATCH 072/648] moved monitoring functionality to utilities, related to issue #22 --- cobald_sim/utility/__init__.py | 0 
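The monitoring utility that this patch moves into its own module hooks into the simulation by replacing ``simpy.Environment.step`` with a wrapper that peeks at the next scheduled event before it is processed. A stripped-down sketch of that wrapping pattern follows; like the utility itself it reaches into simpy's internal ``_queue``, and the ``report`` and ``clock`` helpers are placeholders added only to make the example runnable.

.. code:: python

    from functools import wraps

    import simpy


    def trace(env, callback):
        """Wrap env.step so that callback sees each event before it is processed."""
        original_step = env.step

        @wraps(original_step)
        def tracing_step():
            if len(env._queue):
                time, priority, eid, event = env._queue[0]
                callback(time, priority, eid, event)
            return original_step()

        env.step = tracing_step


    def report(time, priority, eid, event):
        print("t=%s, next event: %s" % (time, event))


    def clock(env):
        while True:
            yield env.timeout(1)


    env = simpy.Environment()
    trace(env, report)
    env.process(clock(env))
    env.run(until=3)

Because only ``step`` is wrapped, the callback fires once per processed event while the simulation itself is left untouched.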
cobald_sim/utility/monitor.py | 78 ++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 cobald_sim/utility/__init__.py create mode 100644 cobald_sim/utility/monitor.py diff --git a/cobald_sim/utility/__init__.py b/cobald_sim/utility/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cobald_sim/utility/monitor.py b/cobald_sim/utility/monitor.py new file mode 100644 index 0000000..c2490bc --- /dev/null +++ b/cobald_sim/utility/monitor.py @@ -0,0 +1,78 @@ +from functools import wraps + +import simpy +import logging + +from cobald_sim.cost import cobald_cost +from cobald_sim.job import Job + +last_step = 0 + + +def trace(env, callback, resource_normalisation, simulator): + def get_wrapper(env_step, callback): + @wraps(env_step) + def tracing_step(): + if len(env._queue): + t, prio, eid, event = env._queue[0] + callback(t, prio, eid, event, resource_normalisation, simulator) + return env_step() + return tracing_step + env.step = get_wrapper(env.step, callback) + + +def monitor(data, t, prio, eid, event, resource_normalisation, simulator): + if event.value: + if isinstance(event.value, simpy.exceptions.Interrupt): + job = event.value.cause + for resource_key, usage in job.used_resources.items(): + value = usage / job.resources[resource_key] + if value > 1: + logging.info(str(round(t)), {"job_exceeds_%s" % resource_key: value}) + if isinstance(event.value, Job): + logging.info(str(round(t)), {"job_waiting_times": event.value.waiting_time}) + global last_step + if t > last_step: + # new data to be recorded + tmp = round(t) + last_step = tmp + pool_demand = 0 + pool_supply = 0 + pool_utilisation = 0 + pool_allocation = 0 + running_jobs = 0 + used_resources = 0 + unused_resources = 0 + available_resources = 0 + empty_drones = 0 + result = {} + for pool in simulator.pools: + pool_demand += pool.demand + pool_supply += pool.supply + result["pool_%s_supply" % id(pool)] = pool.supply + pool_utilisation += pool.utilisation + pool_allocation += pool.allocation + for drone in pool.drones: + running_jobs += drone.jobs + if drone.allocation == 0: + empty_drones += 1 + for resource_key, usage in drone.resources.items(): + normalisation_factor = resource_normalisation.get(resource_key, 1) + used_resources += usage / normalisation_factor + unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor + available_resources += pool.resources[resource_key] / normalisation_factor + result["user_demand"] = len(simulator.job_queue) + result["pool_demand"] = pool_demand + result["pool_supply"] = pool_supply + result["pool_utilisation"] = pool_utilisation + result["pool_allocation"] = pool_allocation + result["running_jobs"] = running_jobs + result["empty_drones"] = empty_drones + result["used_resources"] = used_resources + result["unused_resources"] = unused_resources + result["available_resources"] = available_resources + current_cost = cobald_cost(simulator) + result["cost"] = current_cost + simulator.cost += current_cost + result["acc_cost"] = simulator.cost + logging.info(str(tmp), result) From da77240338bb58c6c5a880aea22e424694d1aaef Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 27 Oct 2018 21:39:16 +0200 Subject: [PATCH 073/648] added gitignore --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0255a5d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__/ +*.py[cod] + +.ipynb_checkpoints +.idea + From 
d1f1533cc34a10358e5b40d4b8623637cf36f25b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 11:07:40 +0100 Subject: [PATCH 074/648] added documentation of pools and simplified creation of static pool --- cobald_sim/pool.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/cobald_sim/pool.py b/cobald_sim/pool.py index 4afa3f7..26a54d6 100644 --- a/cobald_sim/pool.py +++ b/cobald_sim/pool.py @@ -5,14 +5,29 @@ class Pool(interfaces.Pool, container.Container): + """ + A pool encapsulating a number of pools or drones. Given a specific demand, allocation and utilisation, the + pool is able to adapt in terms of number of drones providing the given resources. + + :param env: Reference to the simulation env + :param capacity: Maximum number of pools that can be instantiated within the pool + :param init: Number of pools to instantiate at creation time of the pool + :param resources: Dictionary of resources available for each pool instantiated within the pool + """ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000, "cores": 1}): - super(Pool, self).__init__(env, capacity, init) - self.resources = resources - self._demand = 1 + super(Pool, self).__init__(env, capacity) self._drones = [] self.env = env + self.resources = resources + self.init_pool(init=init) + self._demand = 1 self.action = env.process(self.run()) + def init_pool(self, init=0): + for _ in range(init): + self._drones.append(Drone(self.env, self.resources, 0)) + self.put(init) + def run(self): while True: drones_required = self._demand - self.level @@ -83,13 +98,18 @@ def demand(self, value): class StaticPool(Pool): + """ + A static pool does not react on changing conditions regarding demand, allocation and utilisation but instead + initialises the `capacity` of given drones with initialised `resources`. + + :param env: Reference to the simulation env + :param capacity: Maximum number of pools that can be instantiated within the pool + :param resources: Dictionary of resources available for each pool instantiated within the pool + """ def __init__(self, env, capacity=0, resources={"memory": 8000, "cores": 1}): assert capacity > 0, "Static pool was initialised without any resources..." 
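The class docstring added above describes how a pool adapts the number of drones it provides to the current demand. The sketch below condenses that control loop into a self-contained example; ``DemandPool`` and ``DummyDrone`` are made-up stand-ins rather than the project's ``Pool`` and ``Drone`` classes, and a plain list takes the place of the container level.

.. code:: python

    import simpy


    class DummyDrone(object):
        """Placeholder for a worker node offering a fixed set of resources."""
        def __init__(self, env, resources):
            self.env = env
            self.resources = resources


    class DemandPool(object):
        def __init__(self, env, resources):
            self.env = env
            self.resources = resources
            self.demand = 1
            self.drones = []
            self.action = env.process(self.run())

        def run(self):
            while True:
                # compare demanded drones with the drones currently provided
                required = self.demand - len(self.drones)
                while required > 0:
                    self.drones.append(DummyDrone(self.env, self.resources))
                    required -= 1
                while required < 0 and self.drones:
                    self.drones.pop()
                    required += 1
                yield self.env.timeout(1)


    env = simpy.Environment()
    pool = DemandPool(env, resources={"memory": 8000, "cores": 1})
    pool.demand = 3
    env.run(until=5)
    print(len(pool.drones))  # 3, after the pool adapted to the raised demand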
super(StaticPool, self).__init__(env, capacity=capacity, init=capacity, resources=resources) self._demand = capacity - for _ in range(capacity): - self._drones.append(Drone(self.env, self.resources, 0)) - self.put(capacity) def run(self): while True: From 6c87abaa20e2e70e7275b61da6b736d4b8b52ad4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 11:09:40 +0100 Subject: [PATCH 075/648] initial value at simpy container is now directly set at initialisation time --- cobald_sim/pool.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cobald_sim/pool.py b/cobald_sim/pool.py index 26a54d6..a034b47 100644 --- a/cobald_sim/pool.py +++ b/cobald_sim/pool.py @@ -15,7 +15,7 @@ class Pool(interfaces.Pool, container.Container): :param resources: Dictionary of resources available for each pool instantiated within the pool """ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000, "cores": 1}): - super(Pool, self).__init__(env, capacity) + super(Pool, self).__init__(env, capacity, init) self._drones = [] self.env = env self.resources = resources @@ -26,7 +26,6 @@ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000 def init_pool(self, init=0): for _ in range(init): self._drones.append(Drone(self.env, self.resources, 0)) - self.put(init) def run(self): while True: From 93d8d21ab7da3a1ec1a6bc439a961e03351f39f3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 11:13:11 +0100 Subject: [PATCH 076/648] documentation of init_pool and run methods --- cobald_sim/pool.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cobald_sim/pool.py b/cobald_sim/pool.py index a034b47..5f51fd2 100644 --- a/cobald_sim/pool.py +++ b/cobald_sim/pool.py @@ -24,10 +24,19 @@ def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000 self.action = env.process(self.run()) def init_pool(self, init=0): + """ + Initialisation of existing drones at creation time of pool. + + :param init: Number of drones to create. + """ for _ in range(init): self._drones.append(Drone(self.env, self.resources, 0)) def run(self): + """ + Pool periodically checks the current demand and provided drones. If demand is higher than the current level, + the pool takes care of initialising new drones. Otherwise drones get removed. + """ while True: drones_required = self._demand - self.level while drones_required > 0: @@ -111,5 +120,8 @@ def __init__(self, env, capacity=0, resources={"memory": 8000, "cores": 1}): self._demand = capacity def run(self): + """ + Pool runs forever and does not check if number of drones needs to be adapted. 
+ """ while True: yield self.env.timeout(float("Inf")) From 54dd66c57e90edfa6a5da2b5629b354aa08ce6f0 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 15:28:29 +0100 Subject: [PATCH 077/648] logging and simulator now available through cli, closes issue #24 --- cobald_sim/cli/simulate.py | 165 ++++++++----------------------------- cobald_sim/simulator.py | 15 ++-- 2 files changed, 40 insertions(+), 140 deletions(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index f3b3961..5c9cdf1 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -1,22 +1,16 @@ -import random -from functools import partial, wraps - import click -import simpy import logging.handlers from cobald.monitor.format_json import JsonFormatter from cobald_sim.controller import SimulatedCostController -from cobald_sim.cost import cobald_cost -from cobald_sim.job import job_to_queue_scheduler from cobald_sim.job_io.htcondor import htcondor_job_reader from cobald_sim.pool import StaticPool, Pool from cobald_sim.pool_io.htcondor import htcondor_pool_reader from cobald_sim.job_io.swf import swf_job_reader -from cobald_sim import globals from cobald_sim.scheduler import CondorJobScheduler +from cobald_sim.simulator import Simulator class JSONSocketHandler(logging.handlers.SocketHandler): @@ -26,14 +20,6 @@ def makePickle(self, record): monitoring_logger = logging.getLogger("general") monitoring_logger.setLevel(logging.DEBUG) -socketHandler = JSONSocketHandler( - 'localhost', - logging.handlers.DEFAULT_TCP_LOGGING_PORT) -streamHandler = logging.StreamHandler() -socketHandler.setFormatter(JsonFormatter()) -streamHandler.setFormatter(JsonFormatter()) -monitoring_logger.addHandler(socketHandler) -monitoring_logger.addHandler(streamHandler) last_step = 0 @@ -47,83 +33,24 @@ def makePickle(self, record): } -def trace(env, callback, resource_normalisation): - def get_wrapper(env_step, callback): - @wraps(env_step) - def tracing_step(): - if len(env._queue): - t, prio, eid, event = env._queue[0] - callback(t, prio, eid, event, resource_normalisation) - return env_step() - return tracing_step - env.step = get_wrapper(env.step, callback) - - -def monitor(data, t, prio, eid, event, resource_normalisation): - if event.value: - if isinstance(event.value, simpy.exceptions.Interrupt): - job = event.value.cause - for resource_key, usage in job.used_resources.items(): - value = usage / job.resources[resource_key] - if value > 1: - monitoring_logger.info(str(round(t)), {"job_exceeds_%s" % resource_key: value}) - if isinstance(event.value, Job): - monitoring_logger.info(str(round(t)), {"job_waiting_times": event.value.waiting_time}) - global last_step - if t > last_step: - # new data to be recorded - tmp = round(t) - last_step = tmp - pool_demand = 0 - pool_supply = 0 - pool_utilisation = 0 - pool_allocation = 0 - running_jobs = 0 - used_resources = 0 - unused_resources = 0 - available_resources = 0 - empty_drones = 0 - result = {} - for pool in globals.pools: - pool_demand += pool.demand - pool_supply += pool.supply - result["pool_%s_supply" % id(pool)] = pool.supply - pool_utilisation += pool.utilisation - pool_allocation += pool.allocation - for drone in pool.drones: - running_jobs += drone.jobs - if drone.allocation == 0: - empty_drones += 1 - for resource_key, usage in drone.resources.items(): - normalisation_factor = resource_normalisation.get(resource_key, 1) - used_resources += usage / normalisation_factor - unused_resources += (pool.resources[resource_key] - usage) / 
normalisation_factor - available_resources += pool.resources[resource_key] / normalisation_factor - result["user_demand"] = len(globals.job_queue) - result["pool_demand"] = pool_demand - result["pool_supply"] = pool_supply - result["pool_utilisation"] = pool_utilisation - result["pool_allocation"] = pool_allocation - result["running_jobs"] = running_jobs - result["empty_drones"] = empty_drones - result["used_resources"] = used_resources - result["unused_resources"] = unused_resources - result["available_resources"] = available_resources - current_cost = cobald_cost() - result["cost"] = current_cost - globals.cost += current_cost - result["acc_cost"] = globals.cost - monitoring_logger.info(str(tmp), result) - - @click.group() @click.option("--seed", type=int, default=1234) @click.option("--until", default=2000) +@click.option("--log-tcp", "log_tcp", is_flag=True) +@click.option("--log-file", "log_file", type=click.File("w")) @click.pass_context -def cli(ctx, seed, until): +def cli(ctx, seed, until, log_tcp, log_file): ctx.ensure_object(dict) ctx.obj['seed'] = seed ctx.obj['until'] = until + if log_tcp: + socketHandler = JSONSocketHandler('localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT) + socketHandler.setFormatter(JsonFormatter()) + monitoring_logger.addHandler(socketHandler) + if log_file: + streamHandler = logging.StreamHandler(stream=log_file) + streamHandler.setFormatter(JsonFormatter()) + monitoring_logger.addHandler(streamHandler) @cli.command() @@ -132,23 +59,14 @@ def cli(ctx, seed, until): @click.pass_context def static(ctx, job_file, pool_file): click.echo("starting static environment") - random.seed(ctx.obj["seed"]) - resource_normalisation = {"memory": 2000} - monitor_data = partial(monitor, resource_normalisation) - - env = simpy.Environment() - trace(env, monitor_data, resource_normalisation=resource_normalisation) + simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file - globals.job_generator = job_to_queue_scheduler(job_generator=job_import_mapper[file_type](env, file), - job_queue=globals.job_queue, - env=env) + simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) for current_pool in pool_file: - file, file_type = current_pool - for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=StaticPool): - globals.pools.append(pool) - env.process(globals.job_generator) - globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) - env.run(until=ctx.obj["until"]) + pool_file, pool_file_type = current_pool + simulator.create_pools(pool_input=pool_file, pool_reader=pool_import_mapper[pool_file_type], pool_type=StaticPool) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.run(until=ctx.obj["until"]) @cli.command() @@ -157,24 +75,18 @@ def static(ctx, job_file, pool_file): @click.pass_context def dynamic(ctx, job_file, pool_file): click.echo("starting dynamic environment") - random.seed(ctx.obj["seed"]) - resource_normalisation = {"memory": 2000} - monitor_data = partial(monitor, resource_normalisation) - - env = simpy.Environment() - trace(env, monitor_data, resource_normalisation=resource_normalisation) + simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file - globals.job_generator = job_to_queue_scheduler(job_generator=job_import_mapper[file_type](env, file), - job_queue=globals.job_queue, - env=env) + simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) for current_pool in pool_file: file, file_type = 
current_pool - for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=Pool): - globals.pools.append(pool) - SimulatedCostController(env, target=pool, rate=1) - env.process(globals.job_generator) - globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) - env.run(until=ctx.obj["until"]) + simulator.create_pools( + pool_input=file, + pool_reader=pool_import_mapper[file_type], + pool_type=Pool, + controller=SimulatedCostController) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.run(until=ctx.obj["until"]) @cli.command() @@ -184,28 +96,17 @@ def dynamic(ctx, job_file, pool_file): @click.pass_context def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): click.echo("starting hybrid environment") - random.seed(ctx.obj["seed"]) - resource_normalisation = {"memory": 2000} - monitor_data = partial(monitor, resource_normalisation) - - env = simpy.Environment() - trace(env, monitor_data, resource_normalisation=resource_normalisation) + simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file - globals.job_generator = job_to_queue_scheduler(job_generator=job_import_mapper[file_type](env, file), - job_queue=globals.job_queue, - env=env) + simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) for current_pool in static_pool_file: file, file_type = current_pool - for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=StaticPool): - globals.pools.append(pool) + simulator.create_pools(pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=StaticPool) for current_pool in dynamic_pool_file: file, file_type = current_pool - for pool in pool_import_mapper[file_type](env=env, iterable=file, pool_type=Pool): - globals.pools.append(pool) - SimulatedCostController(env, target=pool, rate=1) - env.process(globals.job_generator) - globals.job_scheduler = CondorJobScheduler(env=env, job_queue=globals.job_queue) - env.run(until=ctx.obj["until"]) + simulator.create_pools(pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=Pool, controller=SimulatedCostController) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.run(until=ctx.obj["until"]) if __name__ == '__main__': diff --git a/cobald_sim/simulator.py b/cobald_sim/simulator.py index 313bac1..35dae6d 100644 --- a/cobald_sim/simulator.py +++ b/cobald_sim/simulator.py @@ -15,22 +15,21 @@ def __init__(self, seed=1234): self.env = simpy.Environment() self.job_queue = [] self.pools = [] - self.job_input = None self.job_scheduler = None self.cost = 0 trace(self.env, monitor_data, resource_normalisation=resource_normalisation, simulator=self) - def create_job_generator(self, filename, job_reader): - self.job_input = open(filename, "r") - job_generator = job_to_queue_scheduler(job_generator=job_reader(self.env, self.job_input), + def create_job_generator(self, job_input, job_reader): + job_generator = job_to_queue_scheduler(job_generator=job_reader(self.env, job_input), job_queue=self.job_queue, env=self.env) self.env.process(job_generator) - def create_pools(self, filename, pool_reader, pool_type): - with open(filename, "r") as pool_input: - for pool in pool_reader(env=self.env, iterable=pool_input, pool_type=pool_type): - self.pools.append(pool) + def create_pools(self, pool_input, pool_reader, pool_type, controller=None): + for pool in pool_reader(env=self.env, iterable=pool_input, pool_type=pool_type): + self.pools.append(pool) + if controller: + 
controller(self.env, target=pool, rate=1) def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(env=self.env, job_queue=self.job_queue, pools=self.pools) From 613e1f1a58b9fb40e6daac5708463d84a4366f15 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 15:43:59 +0100 Subject: [PATCH 078/648] removed main from project --- main.py | 236 -------------------------------------------------------- 1 file changed, 236 deletions(-) delete mode 100644 main.py diff --git a/main.py b/main.py deleted file mode 100644 index f53619e..0000000 --- a/main.py +++ /dev/null @@ -1,236 +0,0 @@ -from functools import partial, wraps -import simpy -import random -import logging.handlers - -from cobald.monitor.format_json import JsonFormatter - -import matplotlib.pyplot as plt - -from cobald_sim.cost import cobald_cost -from cobald_sim.job import job_to_queue_scheduler, Job -from cobald_sim.scheduler import CondorJobScheduler -from cobald_sim.pool import Pool, StaticPool -from cobald_sim.controller import SimulatedCostController -from cobald_sim.job_io.htcondor import htcondor_job_reader -from cobald_sim.pool_io.htcondor import htcondor_pool_reader -from cobald_sim.simulator import Simulator - - -class JSONSocketHandler(logging.handlers.SocketHandler): - def makePickle(self, record): - return self.format(record).encode() - - -monitoring_logger = logging.getLogger("general") -monitoring_logger.setLevel(logging.DEBUG) -socketHandler = JSONSocketHandler( - 'localhost', - logging.handlers.DEFAULT_TCP_LOGGING_PORT) -streamHandler = logging.StreamHandler() -socketHandler.setFormatter(JsonFormatter()) -streamHandler.setFormatter(JsonFormatter()) -monitoring_logger.addHandler(socketHandler) -monitoring_logger.addHandler(streamHandler) - - -def trace(env, callback, resource_normalisation): - def get_wrapper(env_step, callback): - @wraps(env_step) - def tracing_step(): - if len(env._queue): - t, prio, eid, event = env._queue[0] - callback(t, prio, eid, event, resource_normalisation) - return env_step() - return tracing_step - env.step = get_wrapper(env.step, callback) - - -last_step = 0 - - -def monitor(data, t, prio, eid, event, resource_normalisation): - if event.value: - if isinstance(event.value, simpy.exceptions.Interrupt): - job = event.value.cause - for resource_key, usage in job.used_resources.items(): - value = usage / job.resources[resource_key] - if value > 1: - monitoring_logger.info(str(round(t)), {"job_exceeds_%s" % resource_key: value}) - if isinstance(event.value, Job): - monitoring_logger.info(str(round(t)), {"job_waiting_times": event.value.waiting_time}) - global last_step - if t > last_step: - # new data to be recorded - tmp = round(t) - last_step = tmp - pool_demand = 0 - pool_supply = 0 - pool_utilisation = 0 - pool_allocation = 0 - running_jobs = 0 - used_resources = 0 - unused_resources = 0 - available_resources = 0 - empty_drones = 0 - result = {} - for pool in globals.pools: - pool_demand += pool.demand - pool_supply += pool.supply - result["pool_%s_supply" % id(pool)] = pool.supply - pool_utilisation += pool.utilisation - pool_allocation += pool.allocation - for drone in pool.drones: - running_jobs += drone.jobs - if drone.allocation == 0: - empty_drones += 1 - for resource_key, usage in drone.resources.items(): - normalisation_factor = resource_normalisation.get(resource_key, 1) - used_resources += usage / normalisation_factor - unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor - available_resources += 
pool.resources[resource_key] / normalisation_factor - result["user_demand"] = len(globals.job_queue) - result["pool_demand"] = pool_demand - result["pool_supply"] = pool_supply - result["pool_utilisation"] = pool_utilisation - result["pool_allocation"] = pool_allocation - result["running_jobs"] = running_jobs - result["empty_drones"] = empty_drones - result["used_resources"] = used_resources - result["unused_resources"] = unused_resources - result["available_resources"] = available_resources - current_cost = cobald_cost() - result["cost"] = current_cost - globals.cost += current_cost - result["acc_cost"] = globals.cost - monitoring_logger.info(str(tmp), result) - - -def generate_plots(): - # Plotting some first results - keys = globals.monitoring_data["timesteps"].keys() - values = globals.monitoring_data["timesteps"].values() - plt.plot(keys, - [value.get("user_demand", None) for value in values], - label="Accumulated demand") - plt.plot(keys, - [value.get("user_demand_new", None) for value in values], - 'ro', - label="Current demand") - plt.plot(keys, - [value.get("pool_demand", None) for value in values], - label="Pool demand") - plt.plot(keys, - [value.get("pool_supply", None) for value in values], - label="Pool supply") - plt.plot(keys, - [value.get("running_jobs", None) for value in values], - label="Running jobs") - plt.legend() - plt.show() - plt.plot(keys, - [value.get("pool_utilisation", None) for value in values], - label="Pool utilisation") - plt.plot(keys, - [value.get("pool_allocation", None) for value in values], - label="Pool allocation") - plt.plot(keys, - [value.get("empty_drones", None) for value in values], - label="Unallocated drones") - plt.legend() - plt.show() - - for index, pool in enumerate(globals.pools): - print("pool", index, "has", pool.resources) - plt.plot(keys, - [value.get("pool_%s_supply" % pool, None) for value in values], - label="Pool %d supply" % index) - plt.legend() - plt.show() - - fig, ax1 = plt.subplots() - ax1.plot(keys, - [value.get("cost", None) for value in values], 'b-') - ax1.set_xlabel('Time') - # Make the y-axis label, ticks and tick labels match the line color. 
- ax1.set_ylabel('Cost', color='b') - ax1.tick_params('y', colors='b') - - ax2 = ax1.twinx() - ax2.plot(keys, - [value.get("acc_cost", None) for value in values], 'r.') - ax2.set_ylabel('Accumulated Cost', color='r') - ax2.tick_params('y', colors='r') - - fig.tight_layout() - plt.show() - - # resource plot for max - fig, ax = plt.subplots(2, sharex=True) - ax[0].plot(keys, - [value.get("unused_resources", None) for value in values], - label="Unused") - ax[0].plot(keys, - [value.get("used_resources", None) for value in values], - label="Used") - ax[0].set_title("Resource utilisation") - ax[0].legend() - percentages = [] - percentage_means = [] - for value in values: - try: - percentages.append(value.get("unused_resources", 0) / value.get("available_resources", 0)) - except ZeroDivisionError: - percentages.append(1) - percentage_means.append(sum(percentages) / len(percentages)) - ax[1].plot(keys, percentages) - ax[1].plot(keys, percentage_means, label="mean") - ax[1].set_title("Percentage of unused resources") - fig.show() - - # waiting time histogram - plt.hist(globals.monitoring_data["job_waiting_times"], label="Job waiting times") - plt.legend() - plt.show() - - for resource_key in [key for key in globals.monitoring_data.keys() if - isinstance(key, str) and key.startswith("job_exceeds_")]: - plt.hist(globals.monitoring_data[resource_key], label="Job exceeding %s" % - resource_key.replace("job_exceeds_", "")) - plt.legend() - plt.show() - - -def main(filename="condor_usage_sorted_filtered.csv", until=2000): - simulator = Simulator() - simulator.create_job_generator(filename=filename, job_reader=htcondor_job_reader) - simulator.create_pools(filename="ekp_worker.csv", pool_reader=htcondor_pool_reader, pool_type=StaticPool) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) - simulator.run(until=until) - # resource_normalisation = {"memory": 2000} - # monitor_data = partial(monitor, resource_normalisation) - - # random.seed(1234) - # env = simpy.Environment() - # trace(env, monitor_data, resource_normalisation=resource_normalisation) - # with open(filename, "r") as input_file: - # globals.job_generator = job_to_queue_scheduler(job_generator=htcondor_job_reader(simulator.env, input_file), - # job_queue=globals.job_queue, - # env=simulator.env) - # simulator.env.process(globals.job_generator) - # with open("ekp_worker.csv", "r") as pool_input: - # for pool in htcondor_pool_reader(env=simulator.env, iterable=pool_input, pool_type=StaticPool): - # globals.pools.append(pool) - # # for resources in [{"memory": 5000, "cores": 1}, {"memory": 24000, "cores": 8}, {"memory": 16000, "cores": 4}]: - # # pool = Pool(env, resources=resources) - # # globals.pools.append(pool) - # # SimulatedCostController(env, target=pool, rate=1) - # # globals.pools.append(StaticPool(env, capacity=2)) - # globals.job_scheduler = CondorJobScheduler(env=simulator.env, job_queue=globals.job_queue) - # # env.run(until=until) - for job in simulator.job_scheduler.job_queue: - print(job.resources, job.used_resources) - - -if __name__ == "__main__": - main() From e0265f9448fae01c6e1eca0787249775bacfe303 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 16:05:43 +0100 Subject: [PATCH 079/648] renamed underscore to dash in cli interface --- cobald_sim/cli/simulate.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cobald_sim/cli/simulate.py b/cobald_sim/cli/simulate.py index 5c9cdf1..f72c782 100644 --- a/cobald_sim/cli/simulate.py +++ b/cobald_sim/cli/simulate.py @@ -54,8 
+54,8 @@ def cli(ctx, seed, until, log_tcp, log_file): @cli.command() -@click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--job-file", "job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--pool-file", "pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context def static(ctx, job_file, pool_file): click.echo("starting static environment") @@ -70,8 +70,8 @@ def static(ctx, job_file, pool_file): @cli.command() -@click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--job-file", "job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--pool-file", "pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context def dynamic(ctx, job_file, pool_file): click.echo("starting dynamic environment") @@ -90,9 +90,9 @@ def dynamic(ctx, job_file, pool_file): @cli.command() -@click.option("--job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--static_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) -@click.option("--dynamic_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--job-file", "job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--static-pool-file", "static_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--dynamic-pool-file", "dynamic_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): click.echo("starting hybrid environment") From cc39424bb44df6c2bcd382306caf01548f162b65 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 16:31:09 +0100 Subject: [PATCH 080/648] added basic description of library --- README.rst | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 README.rst diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..0b9ea3d --- /dev/null +++ b/README.rst @@ -0,0 +1,35 @@ +========================================= +COBalD_Sim - this is just a working title +========================================= + +The ``cobald_sim`` library provides a framework and runtime for simulating the scheduling and usage of opportunistic +and static resources. + +Command Line Interface +---------------------- + +Currently the library provides a simple command line interface that allows three modes of operation: + +* static provisioning of resources, +* dynamic provisioning of resources, and +* hybrid provisioning of resources. + +In the most simple case you can apply a given workload, e.g. downloaded from the parallel workload archive to a +static resource configuration: + + +.. code:: bash + + python3 simulate.py --log-file - static --job-file swf --pool-file htcondor + +The output of simulation is given to stdout. You have further options you can explore via + +.. 
code:: bash + + python3 simulate.py --help + +and more specifically for the different operation modes with + +.. code:: bash + + python3 simulate.py static --help From 2bc68586b871e019eaa216c7b65f8ee35d583ebf Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 16:51:08 +0100 Subject: [PATCH 081/648] introduced name for pools and jobs --- cobald_sim/job.py | 3 ++- cobald_sim/pool.py | 3 ++- cobald_sim/utility/monitor.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cobald_sim/job.py b/cobald_sim/job.py index c97d068..8ab31b4 100644 --- a/cobald_sim/job.py +++ b/cobald_sim/job.py @@ -34,7 +34,7 @@ def job_demand(simulator): class Job(object): - def __init__(self, env, resources, used_resources=None, in_queue_since=0, queue_date=0): + def __init__(self, env, resources, used_resources=None, in_queue_since=0, queue_date=0, name=None): self.env = env self.resources = resources self.used_resources = used_resources @@ -45,6 +45,7 @@ def __init__(self, env, resources, used_resources=None, in_queue_since=0, queue_ self.in_queue_since = in_queue_since self.in_queue_until = None self.processing = None + self.name = name or id(self) @property def waiting_time(self): diff --git a/cobald_sim/pool.py b/cobald_sim/pool.py index 5f51fd2..c360ee9 100644 --- a/cobald_sim/pool.py +++ b/cobald_sim/pool.py @@ -14,13 +14,14 @@ class Pool(interfaces.Pool, container.Container): :param init: Number of pools to instantiate at creation time of the pool :param resources: Dictionary of resources available for each pool instantiated within the pool """ - def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000, "cores": 1}): + def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000, "cores": 1}, name=None): super(Pool, self).__init__(env, capacity, init) self._drones = [] self.env = env self.resources = resources self.init_pool(init=init) self._demand = 1 + self.name = name or id(self) self.action = env.process(self.run()) def init_pool(self, init=0): diff --git a/cobald_sim/utility/monitor.py b/cobald_sim/utility/monitor.py index c2490bc..0a97371 100644 --- a/cobald_sim/utility/monitor.py +++ b/cobald_sim/utility/monitor.py @@ -49,7 +49,7 @@ def monitor(data, t, prio, eid, event, resource_normalisation, simulator): for pool in simulator.pools: pool_demand += pool.demand pool_supply += pool.supply - result["pool_%s_supply" % id(pool)] = pool.supply + result["pool_%s_supply" % pool.name] = pool.supply pool_utilisation += pool.utilisation pool_allocation += pool.allocation for drone in pool.drones: From b5706425be7c3a5161fd7b007b2fdd41650aafed Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 29 Oct 2018 17:21:12 +0100 Subject: [PATCH 082/648] added name for job from swf job import --- cobald_sim/job_io/swf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cobald_sim/job_io/swf.py b/cobald_sim/job_io/swf.py index babf59c..3f88c68 100644 --- a/cobald_sim/job_io/swf.py +++ b/cobald_sim/job_io/swf.py @@ -46,4 +46,4 @@ def swf_job_reader(env, iterable, resource_name_mapping={ key: float(row[header[used_resource_name_mapping[key]]]) for key in used_resource_name_mapping.keys() if float(row[header[used_resource_name_mapping[key]]]) >= 0 - }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]])) + }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), name=row[header["Job Number"]]) From a0b3f942496bc7d901ecf4ae6ddab02bbf3aa4ff Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 
16 Nov 2018 16:12:41 +0100 Subject: [PATCH 083/648] renamed project to lapis --- README.rst | 8 ++++---- {cobald_sim => lapis}/__init__.py | 0 {cobald_sim => lapis}/cli/__init__.py | 0 {cobald_sim => lapis}/cli/simulate.py | 16 ++++++++-------- {cobald_sim => lapis}/controller.py | 0 {cobald_sim => lapis}/cost.py | 0 {cobald_sim => lapis}/drone.py | 0 {cobald_sim => lapis}/job.py | 7 ++++--- {cobald_sim => lapis}/job_io/__init__.py | 0 {cobald_sim => lapis}/job_io/htcondor.py | 2 +- {cobald_sim => lapis}/job_io/swf.py | 2 +- {cobald_sim => lapis}/pool.py | 0 {cobald_sim => lapis}/pool_io/__init__.py | 0 {cobald_sim => lapis}/pool_io/htcondor.py | 0 {cobald_sim => lapis}/scheduler.py | 0 {cobald_sim => lapis}/simulator.py | 5 +++-- {cobald_sim => lapis}/utility/__init__.py | 0 {cobald_sim => lapis}/utility/monitor.py | 4 ++-- 18 files changed, 23 insertions(+), 21 deletions(-) rename {cobald_sim => lapis}/__init__.py (100%) rename {cobald_sim => lapis}/cli/__init__.py (100%) rename {cobald_sim => lapis}/cli/simulate.py (91%) rename {cobald_sim => lapis}/controller.py (100%) rename {cobald_sim => lapis}/cost.py (100%) rename {cobald_sim => lapis}/drone.py (100%) rename {cobald_sim => lapis}/job.py (92%) rename {cobald_sim => lapis}/job_io/__init__.py (100%) rename {cobald_sim => lapis}/job_io/htcondor.py (97%) rename {cobald_sim => lapis}/job_io/swf.py (98%) rename {cobald_sim => lapis}/pool.py (100%) rename {cobald_sim => lapis}/pool_io/__init__.py (100%) rename {cobald_sim => lapis}/pool_io/htcondor.py (100%) rename {cobald_sim => lapis}/scheduler.py (100%) rename {cobald_sim => lapis}/simulator.py (91%) rename {cobald_sim => lapis}/utility/__init__.py (100%) rename {cobald_sim => lapis}/utility/monitor.py (97%) diff --git a/README.rst b/README.rst index 0b9ea3d..ba0b89c 100644 --- a/README.rst +++ b/README.rst @@ -1,8 +1,8 @@ -========================================= -COBalD_Sim - this is just a working title -========================================= +=============================================================================== +Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator +=============================================================================== -The ``cobald_sim`` library provides a framework and runtime for simulating the scheduling and usage of opportunistic +The ``lapis`` library provides a framework and runtime for simulating the scheduling and usage of opportunistic and static resources. 
Command Line Interface diff --git a/cobald_sim/__init__.py b/lapis/__init__.py similarity index 100% rename from cobald_sim/__init__.py rename to lapis/__init__.py diff --git a/cobald_sim/cli/__init__.py b/lapis/cli/__init__.py similarity index 100% rename from cobald_sim/cli/__init__.py rename to lapis/cli/__init__.py diff --git a/cobald_sim/cli/simulate.py b/lapis/cli/simulate.py similarity index 91% rename from cobald_sim/cli/simulate.py rename to lapis/cli/simulate.py index f72c782..d973c7a 100644 --- a/cobald_sim/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -3,14 +3,14 @@ from cobald.monitor.format_json import JsonFormatter -from cobald_sim.controller import SimulatedCostController -from cobald_sim.job_io.htcondor import htcondor_job_reader -from cobald_sim.pool import StaticPool, Pool -from cobald_sim.pool_io.htcondor import htcondor_pool_reader -from cobald_sim.job_io.swf import swf_job_reader +from lapis.controller import SimulatedCostController +from lapis.job_io.htcondor import htcondor_job_reader +from lapis.pool import StaticPool, Pool +from lapis.pool_io.htcondor import htcondor_pool_reader +from lapis.job_io.swf import swf_job_reader -from cobald_sim.scheduler import CondorJobScheduler -from cobald_sim.simulator import Simulator +from lapis.scheduler import CondorJobScheduler +from lapis.simulator import Simulator class JSONSocketHandler(logging.handlers.SocketHandler): @@ -18,7 +18,7 @@ def makePickle(self, record): return self.format(record).encode() -monitoring_logger = logging.getLogger("general") +monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) last_step = 0 diff --git a/cobald_sim/controller.py b/lapis/controller.py similarity index 100% rename from cobald_sim/controller.py rename to lapis/controller.py diff --git a/cobald_sim/cost.py b/lapis/cost.py similarity index 100% rename from cobald_sim/cost.py rename to lapis/cost.py diff --git a/cobald_sim/drone.py b/lapis/drone.py similarity index 100% rename from cobald_sim/drone.py rename to lapis/drone.py diff --git a/cobald_sim/job.py b/lapis/job.py similarity index 92% rename from cobald_sim/job.py rename to lapis/job.py index 8ab31b4..5d77de0 100644 --- a/cobald_sim/job.py +++ b/lapis/job.py @@ -29,7 +29,7 @@ def job_demand(simulator): value = round(value) if value > 0: simulator.global_demand.put(value) - logging.getLogger("general").info(str(round(simulator.env.now)), {"user_demand_new": value}) + logging.info(str(round(simulator.env.now)), {"user_demand_new": value}) # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) @@ -60,7 +60,8 @@ def process(self): def _process(self): try: - yield self.env.timeout(self.requested_walltime or self.walltime, value=self) + yield self.env.timeout(0, value=self) + yield self.env.timeout(self.requested_walltime or self.walltime) except simpy.exceptions.Interrupt: pass @@ -91,6 +92,6 @@ def job_to_queue_scheduler(job_generator, job_queue, env=None, **kwargs): job = None else: if count > 0: - logging.getLogger("general").info(str(round(env.now)), {"user_demand_new": count}) + logging.info(str(round(env.now)), {"user_demand_new": count}) count = 0 yield env.timeout(1) diff --git a/cobald_sim/job_io/__init__.py b/lapis/job_io/__init__.py similarity index 100% rename from cobald_sim/job_io/__init__.py rename to lapis/job_io/__init__.py diff --git a/cobald_sim/job_io/htcondor.py b/lapis/job_io/htcondor.py similarity index 97% rename from cobald_sim/job_io/htcondor.py rename to lapis/job_io/htcondor.py 
index 35e4bee..2ab2173 100644 --- a/cobald_sim/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -1,6 +1,6 @@ import csv -from cobald_sim.job import Job +from lapis.job import Job def htcondor_job_reader(env, iterable, resource_name_mapping={ diff --git a/cobald_sim/job_io/swf.py b/lapis/job_io/swf.py similarity index 98% rename from cobald_sim/job_io/swf.py rename to lapis/job_io/swf.py index 3f88c68..21e63be 100644 --- a/cobald_sim/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -1,6 +1,6 @@ import csv -from cobald_sim.job import Job +from lapis.job import Job def swf_job_reader(env, iterable, resource_name_mapping={ diff --git a/cobald_sim/pool.py b/lapis/pool.py similarity index 100% rename from cobald_sim/pool.py rename to lapis/pool.py diff --git a/cobald_sim/pool_io/__init__.py b/lapis/pool_io/__init__.py similarity index 100% rename from cobald_sim/pool_io/__init__.py rename to lapis/pool_io/__init__.py diff --git a/cobald_sim/pool_io/htcondor.py b/lapis/pool_io/htcondor.py similarity index 100% rename from cobald_sim/pool_io/htcondor.py rename to lapis/pool_io/htcondor.py diff --git a/cobald_sim/scheduler.py b/lapis/scheduler.py similarity index 100% rename from cobald_sim/scheduler.py rename to lapis/scheduler.py diff --git a/cobald_sim/simulator.py b/lapis/simulator.py similarity index 91% rename from cobald_sim/simulator.py rename to lapis/simulator.py index 35dae6d..484e6be 100644 --- a/cobald_sim/simulator.py +++ b/lapis/simulator.py @@ -3,8 +3,8 @@ import simpy -from cobald_sim.job import job_to_queue_scheduler -from cobald_sim.utility.monitor import monitor, trace +from lapis.job import job_to_queue_scheduler +from lapis.utility.monitor import monitor, trace class Simulator(object): @@ -35,4 +35,5 @@ def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(env=self.env, job_queue=self.job_queue, pools=self.pools) def run(self, until=2000): + print("running until", until) self.env.run(until=until) diff --git a/cobald_sim/utility/__init__.py b/lapis/utility/__init__.py similarity index 100% rename from cobald_sim/utility/__init__.py rename to lapis/utility/__init__.py diff --git a/cobald_sim/utility/monitor.py b/lapis/utility/monitor.py similarity index 97% rename from cobald_sim/utility/monitor.py rename to lapis/utility/monitor.py index 0a97371..0340174 100644 --- a/cobald_sim/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -3,8 +3,8 @@ import simpy import logging -from cobald_sim.cost import cobald_cost -from cobald_sim.job import Job +from lapis.cost import cobald_cost +from lapis.job import Job last_step = 0 From a31544cbaad49e57b74595e96619d8f1f5a411fd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 15:19:03 +0100 Subject: [PATCH 084/648] added test for import of htcondor --- lapis_tests/__init__.py | 0 lapis_tests/data/htcondor_jobs.csv | 18 ++++++++++++++++++ lapis_tests/job_io/__init__.py | 0 lapis_tests/job_io/test_htcondor.py | 11 +++++++++++ 4 files changed, 29 insertions(+) create mode 100644 lapis_tests/__init__.py create mode 100644 lapis_tests/data/htcondor_jobs.csv create mode 100644 lapis_tests/job_io/__init__.py create mode 100644 lapis_tests/job_io/test_htcondor.py diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lapis_tests/data/htcondor_jobs.csv b/lapis_tests/data/htcondor_jobs.csv new file mode 100644 index 0000000..74322b3 --- /dev/null +++ b/lapis_tests/data/htcondor_jobs.csv @@ -0,0 +1,18 @@ +Owner AcctGroup JobUniverse QDate 
JobStartDate CompletionDate RequestCpus RequestMemory MemoryUsage RequestDisk DiskUsage_RAW ExitCode ExitBySignal ExitSignal RequestWalltime RemoteWallClockTime RemoteSysCpu RemoteUserCpu +2648c1 b387f8 5 1526661075 1526662331 0 1 2100 1 12500000 16173 0 False None 12000 88.0 0.0 0.0 +2648c1 b387f8 5 1526661075 1526662330 0 1 2100 1 12500000 16173 0 False None 12000 89.0 0.0 0.0 +2648c1 b387f8 5 1526661075 1526662330 0 1 2100 1 12500000 16173 0 False None 12000 88.0 0.0 0.0 +64a794 461645.4a9963 5 1526756602 1526777752 1526990696 1 2000 419 750000 21226 0 False 1 18000 192431.0 1192.0 15670.0 +64a794 461645.4a9963 5 1526756602 1526773814 1526990324 1 2000 416 750000 17608 0 False 1 18000 192170.0 9.0 16445.0 +64a794 461645.4a9963 5 1526756602 1526777751 1526969234 1 2000 416 750000 17607 0 False None 18000 191462.0 7.0 16102.0 +64a794 461645.4a9963 5 1526818977 1526824229 0 1 2000 3 750000 17070 None True 1 18000 175518.0 0.0 0.0 +64a794 461645.4a9963 5 1526818977 1526824229 0 1 2000 3 750000 17070 None True 1 18000 175515.0 0.0 0.0 +64a794 461645.4a9963 5 1526818977 1526823684 0 1 2000 179 750000 17070 None True 1 18000 588.0 0.0 0.0 +64a794 461645.4a9963 5 1526818977 1526830748 1526991485 1 2000 422 750000 21226 0 False 1 18000 18263.0 1182.0 16464.0 +64a794 461645.4a9963 5 1526818977 1526832606 1526989449 1 2000 418 750000 17608 0 False 1 18000 16115.0 10.0 15577.0 +64a794 461645.4a9963 5 1526890855 1526901710 1526988260 1 2000 414 750000 17607 0 False 1 18000 14905.0 9.0 14391.0 +64a794 461645.4a9963 5 1526890855 1526914093 1526988257 1 2000 418 750000 17608 0 False 1 18000 14770.0 10.0 14390.0 +64a794 461645.4a9963 5 1526892396 1526946920 1526971057 1 2000 416 750000 17608 0 False 1 18000 16803.0 6.0 16506.0 +64a794 461645.4a9963 5 1526892396 1526947770 1526971057 1 2000 416 750000 17608 0 False 1 18000 15971.0 8.0 15667.0 +64a794 461645.4a9963 5 1526892396 1526936400 1526970952 1 2000 409 750000 17608 0 False None 18000 34552.0 13.0 16539.0 +64a794 461645.4a9963 5 1526892396 1526936309 1526970945 1 2000 413 750000 17606 0 False None 18000 34636.0 67.0 34206.0 diff --git a/lapis_tests/job_io/__init__.py b/lapis_tests/job_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py new file mode 100644 index 0000000..8c05266 --- /dev/null +++ b/lapis_tests/job_io/test_htcondor.py @@ -0,0 +1,11 @@ +from lapis.job_io.htcondor import htcondor_job_reader + + +class TestHtcondorJobReader(object): + def test_simple_read(self): + with open("../data/condor_usage_sorted_filtered-ek.csv") as input_file: + jobs = 0 + for job in htcondor_job_reader(None, input_file): + assert job is not None + jobs += 1 + assert jobs > 0 From cee3c9aa4136efbffe6c050d7b54cef9a4de6db2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 15:20:34 +0100 Subject: [PATCH 085/648] added check for wallclock time bigger 0 when importing htcondor jobs --- lapis/job_io/htcondor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 2ab2173..8280466 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -1,4 +1,5 @@ import csv +import logging from lapis.job import Job @@ -18,6 +19,9 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ htcondor_reader = csv.DictReader(iterable, delimiter=' ', quotechar="'") for row in htcondor_reader: + if float(row[used_resource_name_mapping["walltime"]]) <= 0: + 
logging.getLogger("implementation").warning("removed job from htcondor import", row) + continue yield Job( env, resources={ From 9e6864e7a645fc84d298ce81a3388ed27196adc5 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 16:47:26 +0100 Subject: [PATCH 086/648] corrected name of htcondor input file for test --- lapis_tests/job_io/test_htcondor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index 8c05266..ed5b22d 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -3,7 +3,7 @@ class TestHtcondorJobReader(object): def test_simple_read(self): - with open("../data/condor_usage_sorted_filtered-ek.csv") as input_file: + with open("../data/htcondor_jobs.csv") as input_file: jobs = 0 for job in htcondor_job_reader(None, input_file): assert job is not None From 9d324d66a76cd75c0e82a89ea9cea57c4c3d370a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 16:47:46 +0100 Subject: [PATCH 087/648] added travis configuration --- .travis.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..3da6e92 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,22 @@ +language: python +python: + - "3.5" + - "3.6" + - "pypy3" + - "3.7-dev" +os: + - linux +# - osx # osx+python installation fails +matrix: + # ignore all but the most recent, stable releases + allow_failures: + - python: "3.7-dev" +before_script: + - export PYTHONHASHSEED=${PYTHONHASHSEED:-${RANDOM}} + - echo "export PYTHONHASHSEED=${PYTHONHASHSEED}" + - pip install codecov + - export COVERAGE_PROCESS_START=$(pwd)/.coveragerc +script: + - coverage run setup.py test +after_success: + - coverage report && codecov \ No newline at end of file From cdae39f2df9b1a52d22991277e391e359635d170 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 16:56:06 +0100 Subject: [PATCH 088/648] corrected mapping of requested walltime for htcondor imports --- lapis/job_io/htcondor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 8280466..370edb3 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -6,7 +6,7 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ "cores": "RequestCpus", - "walltime": "RemoteWallClockTime", + "walltime": "RequestWalltime", "memory": "RequestMemory", "disk": "RequestDisk" }, used_resource_name_mapping={ From a8b1f6cde5ceb378340b9daab2994be4386f065d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 18:21:28 +0100 Subject: [PATCH 089/648] added setup and about files --- lapis/__about__.py | 16 ++++++++++++++ setup.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 lapis/__about__.py create mode 100644 setup.py diff --git a/lapis/__about__.py b/lapis/__about__.py new file mode 100644 index 0000000..5c57ae8 --- /dev/null +++ b/lapis/__about__.py @@ -0,0 +1,16 @@ +""" +=============================================================================== +Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator +=============================================================================== + +This is a **draft** for a scheduling simulator utilising opportunistic resources. 
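The importer change in PATCH 085 above skips HTCondor rows whose recorded wallclock time is not positive, and PATCH 088 corrects the requested walltime mapping to ``RequestWalltime``. The sketch below replays that filtering step on the space-separated export format; the two sample rows are fabricated, and only the column names and the ``csv.DictReader`` parameters are taken from the import code and the test data.

.. code:: python

    import csv
    import io

    # fabricated rows in the HTCondor export format: one usable job and one
    # with a wallclock time of zero that the importer would skip
    SAMPLE = io.StringIO(
        "RequestCpus RequestWalltime RequestMemory RequestDisk RemoteWallClockTime\n"
        "1 12000 2100 12500000 88.0\n"
        "1 12000 2100 12500000 0.0\n"
    )

    jobs = []
    for row in csv.DictReader(SAMPLE, delimiter=" ", quotechar="'"):
        if float(row["RemoteWallClockTime"]) <= 0:
            # mirror the importer: ignore jobs without a usable wallclock time
            continue
        jobs.append({
            "cores": int(row["RequestCpus"]),
            "walltime": float(row["RequestWalltime"]),
            "memory": float(row["RequestMemory"]),
            "disk": float(row["RequestDisk"]),
        })

    print(len(jobs))  # 1, the zero-walltime row was filtered out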
+""" +__title__ = 'lapis' +__summary__ = 'Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator' +__url__ = 'https://github.com/MaineKuehn/lapis' + +__version__ = '0.1.0' +__author__ = 'Eileen Kuehn, Max Fischer' +__email__ = 'mainekuehn@gmail.com' +__copyright__ = '2018 %s' % __author__ +__keywords__ = 'opportunistic scheduling scheduler cobald simulator' diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f1f20b7 --- /dev/null +++ b/setup.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +import os +from setuptools import setup, find_packages + +repo_base_dir = os.path.abspath(os.path.dirname(__file__)) +# pull in the packages metadata +package_about = {} +with open(os.path.join(repo_base_dir, "lapis", "__about__.py")) as about_file: + exec(about_file.read(), package_about) + + +with open(os.path.join(repo_base_dir, 'README.rst'), 'r') as README: + long_description = README.read() + +if __name__ == '__main__': + setup( + name=package_about['__title__'], + version=package_about['__version__'], + description=package_about['__summary__'], + long_description=long_description.strip(), + author=package_about['__author__'], + author_email=package_about['__email__'], + url=package_about['__url__'], + packages=find_packages(), + # dependencies + install_requires=[ + 'cobald', + 'simpy', + ], + extras_require={ + # 'docs': ["sphinx", "sphinxcontrib-tikz", "sphinx_rtd_theme"], + }, + # metadata for package search + license='MIT', + # https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Information Technology', + 'Intended Audience :: Science/Research', + 'Intended Audience :: System Administrators', + 'Topic :: Adaptive Technologies', + 'Topic :: Office/Business :: Scheduling', + 'Topic :: System :: Distributed Computing', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + ], + keywords=package_about['__keywords__'], + # unit tests + setup_requires=['pytest-runner'], + test_suite='lapis_tests', + tests_require=['pytest'], + ) From 8f86e955a10381e1d59b4ad81d001dc70eb26699 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 18:29:14 +0100 Subject: [PATCH 090/648] added coveragerc --- .coveragerc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..6f17f19 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,19 @@ +[run] +source = lapis +branch = TRUE +cover_pylib = FALSE +parallel = False + +[report] +exclude_lines = + # default + pragma: no cover + # python debug/internals + def __repr__ + if __debug__: + assert + raise AssertionError + raise NotImplementedError + return NotImplemented + if __name__ == "__main__" + if __name__ == '__main__' \ No newline at end of file From 75cb71c18ba46765ea8bcce8667935193f36c2ba Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 22:33:41 +0100 Subject: [PATCH 091/648] added test for importing jobs in swf format --- lapis_tests/data/swf_jobs.swf | 59 ++++++++++++++++++++++++++++++++++ lapis_tests/job_io/test_swf.py | 11 +++++++ 2 files changed, 70 insertions(+) create mode 100644 lapis_tests/data/swf_jobs.swf create mode 100644 lapis_tests/job_io/test_swf.py diff --git a/lapis_tests/data/swf_jobs.swf b/lapis_tests/data/swf_jobs.swf new file mode 100644 index 
0000000..9fd7cd1 --- /dev/null +++ b/lapis_tests/data/swf_jobs.swf @@ -0,0 +1,59 @@ +; Example taken from Gaia export by Joseph Emeras (joseph.emeras@gmail.com) provided at Parallel Workloads Archive +; see: http://www.cs.huji.ac.il/labs/parallel/workload/l_unilu_gaia/index.html +; +; --- SWF file converted from SQL request to the OAR database and jobs resource usage collected by Colmet --- +; Version: 2.2 +; Computer: Bull / Dell mix +; Conversion: Joseph Emeras - uncleaned version - March 2015 +; +; Installation: UNILU Gaia Cluster +; Acknowledge: Joseph Emeras, SnT +; Information: https://hpc.uni.lu/systems/gaia/ +; +; MaxJobs: 51987 +; MaxRecords: 51987 +; +; UnixStartTime: 1400749079 +; +; TimeZoneString: Europe/Luxembourg +; StartTime: Thu May 22 10:57:59 CEST 2014 +; EndTime: Tue Aug 19 13:06:12 CEST 2014 +; +; MaxNodes: 151 +; MaxProcs: 2004 +; +; MaxQueues: 3 +; Queues: Queue nb 1 is default queue. Queue nb 2 is besteffort jobs queue. Besteffort jobs are low priority and preemptible jobs. Queue 0 is Interactive jobs. +; Queue: 1 - Normal jobs queue - normal priority - jobs are not preemptible +; Queue: 2 - Besteffort jobs queue - low priority - besteffort jobs can be preempted by normal jobs. +; Queue: 0 - Interactive jobs - normal priority - jobs are not preemptible - interactive jobs provide a direct shell on the nodes to the user. +; +; Note: Scheduler is OAR (http://oar.imag.fr/) +; +; +; 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18 +; | | | | | | | | | | | | | | | | | +; j| s| w| r| p| c| m| p| u| m| s| u| g| e| q| p| p| t +; o| u| a| u| r| p| e| r| s| e| t| i| i| x| | a| r| h +; b| b| i| n| o| u| m| o| e| m| a| d| d| e| n| r| e| i +; | m| t| t| c| | | c| r| | t| | | | u| t| v| n +; | i| | i| | u| u| | | r| u| | | n| m| i| | k +; | t| | m| a| s| s| r| e| e| s| | | u| | t| j| +; | | | e| l| e| e| e| s| q| | | | m| | i| o| t +; | | | | l| d| d| q| t| | | | | | | o| b| i +; | | | | o| | | | | | | | | | | n| | m +; | | | | c| | | | | | | | | | | | | e +; | | | | | | | | | | | | | | | | | +; MaxQueues: 3 +; Queue: 0 interactive interactive +; Queue: 1 default default +; Queue: 2 besteffort besteffort +; + 1 0 477768 35541 160 32096 89734 160 108000 -1 1 1 1 1 1 -1 -1 -1 + 2 83558 1 432024 36 1320 7566 36 432000 -1 0 2 2 2 1 -1 -1 -1 + 3 195861 1 278442 64 278442 34640 64 432000 -1 1 3 3 3 1 -1 -1 -1 + 4 278659 2 268225 4 4023 4864 4 345600 -1 1 4 4 4 1 -1 -1 -1 + 5 339016 1 305581 24 1783 14805 24 432000 -1 0 5 5 5 1 -1 -1 -1 + 6 339299 1 214651 24 358.00 2560 24 432000 -1 1 5 5 6 1 -1 -1 -1 + 7 340144 1 432006 24 720.00 2005 24 432000 -1 0 5 5 7 1 -1 -1 -1 + 8 340462 2 370083 24 925.00 4565 24 432000 -1 0 5 5 7 1 -1 -1 -1 \ No newline at end of file diff --git a/lapis_tests/job_io/test_swf.py b/lapis_tests/job_io/test_swf.py new file mode 100644 index 0000000..367f31d --- /dev/null +++ b/lapis_tests/job_io/test_swf.py @@ -0,0 +1,11 @@ +from lapis.job_io.swf import swf_job_reader + + +class TestSwfJobReader(object): + def test_simple_read(self): + with open("../data/swf_jobs.swf") as input_file: + job_count = 0 + for job in swf_job_reader(None, input_file): + assert job is not None + job_count += 1 + assert job_count > 0 From e150a449767934fef4f3d143c619a93ef1990ecd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 22:51:14 +0100 Subject: [PATCH 092/648] added setup config --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b7e4789 --- /dev/null 
+++ b/setup.cfg @@ -0,0 +1,2 @@ +[aliases] +test=pytest From 9b57c461c32b71921c407621332f978ad280824e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 14 Dec 2018 22:59:11 +0100 Subject: [PATCH 093/648] using absolute paths to data test files --- lapis_tests/job_io/test_htcondor.py | 4 +++- lapis_tests/job_io/test_swf.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index ed5b22d..e5d14e4 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -1,9 +1,11 @@ +import os + from lapis.job_io.htcondor import htcondor_job_reader class TestHtcondorJobReader(object): def test_simple_read(self): - with open("../data/htcondor_jobs.csv") as input_file: + with open(os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv")) as input_file: jobs = 0 for job in htcondor_job_reader(None, input_file): assert job is not None diff --git a/lapis_tests/job_io/test_swf.py b/lapis_tests/job_io/test_swf.py index 367f31d..3395af1 100644 --- a/lapis_tests/job_io/test_swf.py +++ b/lapis_tests/job_io/test_swf.py @@ -1,9 +1,10 @@ +import os from lapis.job_io.swf import swf_job_reader class TestSwfJobReader(object): def test_simple_read(self): - with open("../data/swf_jobs.swf") as input_file: + with open(os.path.join(os.path.dirname(__file__), "..", "data", "swf_jobs.swf")) as input_file: job_count = 0 for job in swf_job_reader(None, input_file): assert job is not None From 71213eb0fb34094c1986ea055e141836300915c6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 19 Dec 2018 11:05:52 +0100 Subject: [PATCH 094/648] added example with 0 used walltime for htcondor data --- lapis_tests/data/htcondor_jobs.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis_tests/data/htcondor_jobs.csv b/lapis_tests/data/htcondor_jobs.csv index 74322b3..b692e23 100644 --- a/lapis_tests/data/htcondor_jobs.csv +++ b/lapis_tests/data/htcondor_jobs.csv @@ -1,5 +1,5 @@ Owner AcctGroup JobUniverse QDate JobStartDate CompletionDate RequestCpus RequestMemory MemoryUsage RequestDisk DiskUsage_RAW ExitCode ExitBySignal ExitSignal RequestWalltime RemoteWallClockTime RemoteSysCpu RemoteUserCpu -2648c1 b387f8 5 1526661075 1526662331 0 1 2100 1 12500000 16173 0 False None 12000 88.0 0.0 0.0 +2648c1 b387f8 5 1526661075 1526662331 0 1 2100 1 12500000 16173 0 False None 12000 0.0 0.0 0.0 2648c1 b387f8 5 1526661075 1526662330 0 1 2100 1 12500000 16173 0 False None 12000 89.0 0.0 0.0 2648c1 b387f8 5 1526661075 1526662330 0 1 2100 1 12500000 16173 0 False None 12000 88.0 0.0 0.0 64a794 461645.4a9963 5 1526756602 1526777752 1526990696 1 2000 419 750000 21226 0 False 1 18000 192431.0 1192.0 15670.0 From c99d77b4774d3544840858c7916ec07c5121de9d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 19 Dec 2018 11:06:15 +0100 Subject: [PATCH 095/648] added test for correct removal of wrong data for htcondor import --- lapis_tests/job_io/test_htcondor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index e5d14e4..5911554 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -11,3 +11,7 @@ def test_simple_read(self): assert job is not None jobs += 1 assert jobs > 0 + with open(os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv")) as input_file: + # ensure that one job was removed by importer (wrong walltime given) + lines = sum(1 for _ in 
input_file) + assert jobs == (lines - 2) From 138dc5bd6ba1e600d3a1b3b9ecf0cdf8cbdeec4e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 20 Mar 2019 16:14:50 +0100 Subject: [PATCH 096/648] added importing of machine inforamation exported from htcondor --- lapis/pool_io/machines.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 lapis/pool_io/machines.py diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py new file mode 100644 index 0000000..143eac1 --- /dev/null +++ b/lapis/pool_io/machines.py @@ -0,0 +1,25 @@ +import csv + +from ..pool import Pool + + +def machines_pool_reader(env, iterable, resource_name_mapping={ + "cores": "CPUs_per_node", + "memory": "RAM_per_node_in_KB" +}, pool_type=Pool): + """ + Load a pool configuration that was exported via htcondor from files or iterables + + :param iterable: an iterable yielding lines of CSV, such as an open file + :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation + :param pool_type: The type of pool to be yielded + :return: Yields the :py:class:`StaticPool`s found in the given iterable + """ + reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) + for row_idx, row in enumerate(reader): + yield pool_type( + env, + capacity=int(row["number_of_nodes"]), + resources={key: float(row[value]) for key, value in resource_name_mapping.items()}, + name=row["cluster_name"] + ) From 0fc0102e12a7f44ecdfa5a66ea3b0ec0c4eb0c9d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 21 Mar 2019 21:56:41 +0100 Subject: [PATCH 097/648] exchanged simpy with usim simulator --- lapis/cli/simulate.py | 11 ++-- lapis/controller.py | 21 ++++---- lapis/drone.py | 104 +++++++++++++++++++++----------------- lapis/job.py | 31 +++++++----- lapis/job_io/htcondor.py | 3 +- lapis/job_io/swf.py | 5 +- lapis/pool.py | 89 ++++++++++++++++++-------------- lapis/pool_io/htcondor.py | 11 ++-- lapis/pool_io/machines.py | 10 ++-- lapis/scheduler.py | 91 +++++++++++++++++++-------------- lapis/simulator.py | 45 ++++++++++++----- 11 files changed, 245 insertions(+), 176 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index d973c7a..001ca87 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -62,10 +62,13 @@ def static(ctx, job_file, pool_file): simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: pool_file, pool_file_type = current_pool - simulator.create_pools(pool_input=pool_file, pool_reader=pool_import_mapper[pool_file_type], pool_type=StaticPool) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.create_pools( + pool_input=pool_file, + pool_reader=pool_import_mapper[pool_file_type], + pool_type=StaticPool) simulator.run(until=ctx.obj["until"]) @@ -78,6 +81,7 @@ def dynamic(ctx, job_file, pool_file): simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: file, file_type = current_pool simulator.create_pools( @@ -85,7 +89,6 @@ def dynamic(ctx, job_file, pool_file): pool_reader=pool_import_mapper[file_type], pool_type=Pool, controller=SimulatedCostController) - 
simulator.create_scheduler(scheduler_type=CondorJobScheduler) simulator.run(until=ctx.obj["until"]) @@ -99,13 +102,13 @@ def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in static_pool_file: file, file_type = current_pool simulator.create_pools(pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=StaticPool) for current_pool in dynamic_pool_file: file, file_type = current_pool simulator.create_pools(pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=Pool, controller=SimulatedCostController) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) simulator.run(until=ctx.obj["until"]) diff --git a/lapis/controller.py b/lapis/controller.py index d6198d4..1bbd00b 100644 --- a/lapis/controller.py +++ b/lapis/controller.py @@ -1,42 +1,39 @@ from cobald.controller.linear import LinearController from cobald.controller.relative_supply import RelativeSupplyController from cobald.interfaces import Pool +from usim import time class SimulatedLinearController(LinearController): - def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): + def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): super(SimulatedLinearController, self).__init__(target, low_utilisation, high_allocation, rate, interval) - self.env = env - self.action = env.process(self.run()) - def run(self): + async def run(self): while True: self.regulate(interval=self.interval) # print("[controller] demand %d -> %d, supply %d (global %d), allocation %.2f, utilisation %.2f " # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, # self.target.allocation, self.target.utilisation, self.target.level)) - yield self.env.timeout(self.interval) + await (time + self.interval) class SimulatedRelativeSupplyController(RelativeSupplyController): - def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, low_scale=0.9, high_scale=1.1, + def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, low_scale=0.9, high_scale=1.1, interval=1): super(SimulatedRelativeSupplyController, self).__init__(target=target, low_utilisation=low_utilisation, high_allocation=high_allocation, low_scale=low_scale, high_scale=high_scale, interval=interval) - self.env = env - self.action = env.process(self.run()) - def run(self): + async def run(self): while True: self.regulate(interval=self.interval) - yield self.env.timeout(self.interval) + await (time + self.interval) class SimulatedCostController(SimulatedLinearController): - def __init__(self, env, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): + def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): self.current_cost = 1 - super(SimulatedCostController, self).__init__(env, target, low_utilisation, high_allocation, rate, interval) + super(SimulatedCostController, self).__init__(target, low_utilisation, high_allocation, rate, interval) def regulate(self, interval): allocation = 0 diff --git a/lapis/drone.py b/lapis/drone.py index 7902c40..e7a9ca8 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,23 +1,29 @@ from cobald import interfaces +from usim import time, Scope class 
Drone(interfaces.Pool): - def __init__(self, env, pool_resources, scheduling_duration): + def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float): super(Drone, self).__init__() - self.env = env + self.scheduler = scheduler self.pool_resources = pool_resources - self.action = env.process(self.run(scheduling_duration)) self.resources = {resource: 0 for resource in self.pool_resources} + self.scheduling_duration = scheduling_duration # shadowing requested resources to determine jobs to be killed self.used_resources = {resource: 0 for resource in self.pool_resources} - self._supply = 0 + if scheduling_duration == 0: + self._supply = 1 + self.scheduler.register_drone(self) + else: + self._supply = 0 self.jobs = 0 self._allocation = None self._utilisation = None - def run(self, scheduling_duration): - yield self.env.timeout(scheduling_duration) + async def run(self): + await (time + self.scheduling_duration) self._supply = 1 + self.scheduler.register_drone(self) @property def supply(self): @@ -45,17 +51,18 @@ def allocation(self): def _init_allocation_and_utilisation(self): resources = [] - for resource_key, value in self.resources.items(): + for resource_key, value in self.used_resources.items(): resources.append(value / self.pool_resources[resource_key]) self._allocation = max(resources) self._utilisation = min(resources) - def shutdown(self): + async def shutdown(self): self._supply = 0 - yield self.env.timeout(1) + self.scheduler.unregister_drone(self) + await (time + 1) # print("[drone %s] has been shut down" % self) - def start_job(self, job, kill=False): + async def start_job(self, job, kill=False): """ Method manages to start a job in the context of the given drone. The job is started independent of available resources. If resources of drone are exceeded, the job is killed. 
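
A minimal sketch of how the Drone and Job coroutines introduced here can be driven under usim. Only the signatures visible in this patch are assumed; the SchedulerStub and the resource numbers are invented for illustration.

from usim import run, Scope

from lapis.drone import Drone
from lapis.job import Job


class SchedulerStub:
    """Provides only the two hooks a Drone calls on its scheduler."""
    def register_drone(self, drone):
        pass

    def unregister_drone(self, drone):
        pass


async def simulate_one_job():
    # a drone that is immediately available (scheduling_duration=0 registers it right away)
    drone = Drone(SchedulerStub(), {"cores": 1, "memory": 8000}, scheduling_duration=0)
    # requested resources deviate slightly from the measured usage, as in the imported data
    job = Job(resources={"cores": 1, "memory": 2000, "walltime": 100},
              used_resources={"cores": 1, "memory": 1500, "walltime": 60})
    async with Scope() as scope:
        scope.do(drone.start_job(job))


run(simulate_one_job())
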
@@ -64,43 +71,46 @@ def start_job(self, job, kill=False): :param kill: if True, a job is killed when used resources exceed requested resources :return: """ - self._utilisation = None - self._allocation = None - self.jobs += 1 - job_execution = job.process() - for resource_key in job.resources: - try: - if self.used_resources[resource_key] + job.used_resources[resource_key] > self.pool_resources[resource_key]: - job.kill() - except KeyError: - # we do not have data about how many resources the job used, so check with requested data - if self.used_resources[resource_key] + job.resources[resource_key] > self.pool_resources[resource_key]: - job.kill() - try: - if job.resources[resource_key] < job.used_resources[resource_key]: - if kill: + async with Scope() as scope: + self._utilisation = None + self._allocation = None + self.jobs += 1 + job_execution = scope.do(job.run()) + # TODO: needs to be killed if resources are exceeding + for resource_key in job.resources: + try: + if self.used_resources[resource_key] + job.used_resources[resource_key] > self.pool_resources[resource_key]: job.kill() - else: - pass - except KeyError: - # check is not relevant if the data is not stored - pass - for resource_key in job.resources: - self.resources[resource_key] += job.resources[resource_key] - try: - self.used_resources[resource_key] += job.used_resources[resource_key] - except KeyError: - self.used_resources[resource_key] += job.resources[resource_key] - yield job_execution - self.jobs -= 1 - self._utilisation = None - self._allocation = None - for resource_key in job.resources: - self.resources[resource_key] -= job.resources[resource_key] - for resource_key in {*job.resources, *job.used_resources}: - try: - self.used_resources[resource_key] -= job.used_resources[resource_key] - except KeyError: - self.used_resources[resource_key] -= job.resources[resource_key] + except KeyError: + # we do not have data about how many resources the job used, so check with requested data + if self.used_resources[resource_key] + job.resources[resource_key] > self.pool_resources[resource_key]: + job.kill() + try: + if job.resources[resource_key] < job.used_resources[resource_key]: + if kill: + job.kill() + else: + pass + except KeyError: + # check is not relevant if the data is not stored + pass + for resource_key in job.resources: + self.resources[resource_key] += job.resources[resource_key] + for resource_key in {*job.resources, *job.used_resources}: + try: + self.used_resources[resource_key] += job.used_resources[resource_key] + except KeyError: + self.used_resources[resource_key] += job.resources[resource_key] + await job_execution + self.jobs -= 1 + self._utilisation = None + self._allocation = None + for resource_key in job.resources: + self.resources[resource_key] -= job.resources[resource_key] + for resource_key in {*job.resources, *job.used_resources}: + try: + self.used_resources[resource_key] -= job.used_resources[resource_key] + except KeyError: + self.used_resources[resource_key] -= job.resources[resource_key] # put drone back into pool queue # print("[drone %s] finished job at %d" % (self, self.env.now)) diff --git a/lapis/job.py b/lapis/job.py index 5d77de0..698f109 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -3,7 +3,10 @@ import simpy import logging +from usim import time + +# TODO: needs refactoring def job_demand(simulator): """ function randomly sets global user demand by using different strategies @@ -33,9 +36,9 @@ def job_demand(simulator): # print("[demand] raising user demand for %f at %d to %d" % 
(value, env.now, globals.global_demand.level)) +# TODO: needs refactoring class Job(object): - def __init__(self, env, resources, used_resources=None, in_queue_since=0, queue_date=0, name=None): - self.env = env + def __init__(self, resources, used_resources=None, in_queue_since=0, queue_date=0, name=None): self.resources = resources self.used_resources = used_resources self.walltime = used_resources.pop("walltime", None) @@ -53,11 +56,14 @@ def waiting_time(self): return self.in_queue_until - self.in_queue_since return float("Inf") - def process(self): - self.in_queue_until = self.env.now - self.processing = self.env.process(self._process()) - return self.processing + async def run(self): + self.in_queue_until = time.now + # self.processing = self.env.process(self._process()) + # return self.processing + await (time + self.walltime or self.requested_walltime) + print("%s: job finished after %s" % (time.now, self.walltime or self.requested_walltime)) + # TODO: not needed anymore? def _process(self): try: yield self.env.timeout(0, value=self) @@ -65,6 +71,7 @@ def _process(self): except simpy.exceptions.Interrupt: pass + # TODO: interrupt should be integrated def kill(self): # job exceeds either own requested resources or resources provided by drone self.processing.interrupt(cause=self) @@ -75,7 +82,7 @@ def job_property_generator(**kwargs): yield 10, {"memory": 8, "cores": 1, "disk": 100} -def job_to_queue_scheduler(job_generator, job_queue, env=None, **kwargs): +async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): job = next(job_generator) base_date = job.queue_date current_time = 0 @@ -85,13 +92,13 @@ def job_to_queue_scheduler(job_generator, job_queue, env=None, **kwargs): if not job: job = next(job_generator) current_time = job.queue_date - base_date - if env.now >= current_time: + if time.now >= current_time: count += 1 - job.in_queue_since = env.now - job_queue.append(job) + job.in_queue_since = time.now + await job_queue.put(job) job = None else: if count > 0: - logging.info(str(round(env.now)), {"user_demand_new": count}) + logging.info(str(round(time.now)), {"user_demand_new": count}) count = 0 - yield env.timeout(1) + await (time == current_time) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 370edb3..a21f6d8 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -4,7 +4,7 @@ from lapis.job import Job -def htcondor_job_reader(env, iterable, resource_name_mapping={ +def htcondor_job_reader(iterable, resource_name_mapping={ "cores": "RequestCpus", "walltime": "RequestWalltime", "memory": "RequestMemory", @@ -23,7 +23,6 @@ def htcondor_job_reader(env, iterable, resource_name_mapping={ logging.getLogger("implementation").warning("removed job from htcondor import", row) continue yield Job( - env, resources={ key: float(row[resource_name_mapping[key]]) for key in resource_name_mapping }, used_resources={ diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 21e63be..5493e98 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -3,7 +3,7 @@ from lapis.job import Job -def swf_job_reader(env, iterable, resource_name_mapping={ +def swf_job_reader(iterable, resource_name_mapping={ "cores": "Requested Number of Processors", "walltime": "Requested Time", "memory": "Requested Memory" @@ -36,7 +36,6 @@ def swf_job_reader(env, iterable, resource_name_mapping={ reader = csv.reader((line for line in iterable if line[0] != ';'), delimiter=' ', skipinitialspace=True) for row in reader: yield Job( - env, resources={ key: 
float(row[header[resource_name_mapping[key]]]) for key in ("cores", "memory", "walltime") @@ -44,6 +43,6 @@ def swf_job_reader(env, iterable, resource_name_mapping={ }, used_resources={ key: float(row[header[used_resource_name_mapping[key]]]) - for key in used_resource_name_mapping.keys() + for key in ("cores", "memory", "walltime") if float(row[header[used_resource_name_mapping[key]]]) >= 0 }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), name=row[header["Job Number"]]) diff --git a/lapis/pool.py b/lapis/pool.py index c360ee9..04959b7 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -1,10 +1,9 @@ -from simpy.resources import container +from typing import Generator, Callable from cobald import interfaces +from usim import time, eternity, Scope -from .drone import Drone - -class Pool(interfaces.Pool, container.Container): +class Pool(interfaces.Pool): """ A pool encapsulating a number of pools or drones. Given a specific demand, allocation and utilisation, the pool is able to adapt in terms of number of drones providing the given resources. @@ -14,15 +13,26 @@ class Pool(interfaces.Pool, container.Container): :param init: Number of pools to instantiate at creation time of the pool :param resources: Dictionary of resources available for each pool instantiated within the pool """ - def __init__(self, env, capacity=float('inf'), init=0, resources={"memory": 8000, "cores": 1}, name=None): - super(Pool, self).__init__(env, capacity, init) + def __init__(self, capacity=float('inf'), init=0, name=None, make_drone: Callable=None): + super(Pool, self).__init__() + assert make_drone + self.make_drone= make_drone self._drones = [] - self.env = env - self.resources = resources self.init_pool(init=init) self._demand = 1 self.name = name or id(self) - self.action = env.process(self.run()) + self.level = init + self._capacity = capacity + + def put(self, amount): + if self.level + amount > self._capacity: + raise ValueError + self.level += amount + + def get(self, amount): + if self.level - amount < 0: + raise ValueError + self.level -= amount def init_pool(self, init=0): """ @@ -31,39 +41,43 @@ def init_pool(self, init=0): :param init: Number of drones to create. """ for _ in range(init): - self._drones.append(Drone(self.env, self.resources, 0)) + self._drones.append(self.make_drone(0)) - def run(self): + # TODO: the run method currently needs to be called manually + async def run(self): """ Pool periodically checks the current demand and provided drones. If demand is higher than the current level, the pool takes care of initialising new drones. Otherwise drones get removed. 
""" - while True: - drones_required = self._demand - self.level - while drones_required > 0: - drones_required -= 1 - # start a new drone - self._drones.append(Drone(self.env, self.resources, 10)) - yield self.put(1) - if self.level > self._demand: - for drone in self._drones: - if drone.jobs == 0: + async with Scope() as scope: + while True: + drones_required = self._demand - self.level + while drones_required > 0: + drones_required -= 1 + # start a new drone + drone = self.make_drone(10) + scope.do(drone.run()) + self._drones.append(drone) + self.put(1) + if self.level > self._demand: + for drone in self._drones: + if drone.jobs == 0: + break + else: break - else: - break - yield self.get(1) - self._drones.remove(drone) - yield from drone.shutdown() - del drone - yield self.env.timeout(1) + self.get(1) + self._drones.remove(drone) + scope.do(drone.shutdown()) + del drone + await (time + 1) @property - def drones(self): + def drones(self) -> Generator[int, None, None]: for drone in self._drones: if drone.supply > 0: yield drone - def drone_demand(self): + def drone_demand(self) -> int: return len(self._drones) @property @@ -88,18 +102,18 @@ def utilisation(self) -> float: return 1 @property - def supply(self): + def supply(self) -> int: supply = 0 for drone in self._drones: supply += drone.supply return supply @property - def demand(self): + def demand(self) -> int: return self._demand @demand.setter - def demand(self, value): + def demand(self, value: int): if value > 0: self._demand = value else: @@ -111,18 +125,17 @@ class StaticPool(Pool): A static pool does not react on changing conditions regarding demand, allocation and utilisation but instead initialises the `capacity` of given drones with initialised `resources`. - :param env: Reference to the simulation env :param capacity: Maximum number of pools that can be instantiated within the pool :param resources: Dictionary of resources available for each pool instantiated within the pool """ - def __init__(self, env, capacity=0, resources={"memory": 8000, "cores": 1}): + def __init__(self, capacity=0, make_drone: Callable=None): assert capacity > 0, "Static pool was initialised without any resources..." - super(StaticPool, self).__init__(env, capacity=capacity, init=capacity, resources=resources) + super(StaticPool, self).__init__(capacity=capacity, init=capacity, make_drone=make_drone) self._demand = capacity - def run(self): + async def run(self): """ Pool runs forever and does not check if number of drones needs to be adapted. 
""" while True: - yield self.env.timeout(float("Inf")) + await eternity diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 79696a3..81432a2 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -1,24 +1,27 @@ import csv +from functools import partial +from typing import Callable from ..pool import Pool -def htcondor_pool_reader(env, iterable, resource_name_mapping={ +def htcondor_pool_reader(iterable, resource_name_mapping: dict={ "cores": "TotalSlotCPUs", "disk": "TotalSlotDisk", "memory": "TotalSlotMemory" -}, pool_type=Pool): +}, pool_type: Callable=Pool, make_drone: Callable=None): """ Load a pool configuration that was exported via htcondor from files or iterables :param iterable: an iterable yielding lines of CSV, such as an open file :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation :param pool_type: The type of pool to be yielded + :param make_drone: :return: Yields the :py:class:`StaticPool`s found in the given iterable """ + assert make_drone reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) for row_idx, row in enumerate(reader): yield pool_type( - env, capacity=int(row["Count"]), - resources={key: float(row[value]) for key, value in resource_name_mapping.items()}) + make_drone=partial(make_drone, {key: float(row[value]) for key, value in resource_name_mapping.items()})) diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index 143eac1..91a397e 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -1,12 +1,14 @@ import csv +from functools import partial +from typing import Callable from ..pool import Pool -def machines_pool_reader(env, iterable, resource_name_mapping={ +def machines_pool_reader(iterable, resource_name_mapping={ "cores": "CPUs_per_node", "memory": "RAM_per_node_in_KB" -}, pool_type=Pool): +}, pool_type=Pool, make_drone: Callable=None): """ Load a pool configuration that was exported via htcondor from files or iterables @@ -15,11 +17,11 @@ def machines_pool_reader(env, iterable, resource_name_mapping={ :param pool_type: The type of pool to be yielded :return: Yields the :py:class:`StaticPool`s found in the given iterable """ + assert make_drone reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) for row_idx, row in enumerate(reader): yield pool_type( - env, capacity=int(row["number_of_nodes"]), - resources={key: float(row[value]) for key, value in resource_name_mapping.items()}, + make_drone=partial(make_drone, {key: float(row[value]) for key, value in resource_name_mapping.items()}), name=row["cluster_name"] ) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 863845a..d0bb254 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,4 +1,10 @@ +from usim import time, Scope + + # TODO: does not work anymore as there is no method get_drone at pool +from lapis.drone import Drone + + def job_scheduler(simulator): while True: for pool in simulator.pools: @@ -22,50 +28,59 @@ class CondorJobScheduler(object): :param env: :return: """ - def __init__(self, env, job_queue, pools): - self.env = env + def __init__(self, job_queue): self.job_queue = job_queue - self.pools = pools - self.action = env.process(self.run()) + self.drone_list = [] - def run(self): + def register_drone(self, drone: Drone): + self.drone_list.append(drone) + + def unregister_drone(self, drone: Drone): + self.drone_list.remove(drone) + + async def run(self): # current_job = None # postponed_unmatched_job = False - while 
True: - for job in self.job_queue: - best_match = self._schedule_job(job) - if best_match: - self.env.process(best_match.start_job(job)) - self.job_queue.remove(job) - yield self.env.timeout(0) - yield self.env.timeout(60) + async with Scope() as scope: + temp = [] + while True: + async for job in self.job_queue: + best_match = self._schedule_job(job) + if best_match: + scope.do(best_match.start_job(job)) + else: + temp.append(job) + # put all the jobs that could not be scheduled back into the queue + while temp: + job = temp.pop() + await self.job_queue.put(job) + await (time + 60) - def _schedule_job(self, job): + def _schedule_job(self, job) -> Drone: priorities = {} - for pool in self.pools: - for drone in pool.drones: - cost = 0 - resource_types = {*drone.resources.keys(), *job.resources.keys()} - for resource_type in resource_types: - if resource_type not in drone.resources.keys(): - cost = float("Inf") - elif resource_type not in job.resources: - cost += drone.resources[resource_type] - drone.resources[resource_type] - elif (pool.resources[resource_type] - drone.resources[resource_type]) < \ - job.resources[resource_type]: - cost = float("Inf") - break - else: - cost += (pool.resources[resource_type] - drone.resources[resource_type]) // \ - job.resources[resource_type] - cost /= len(resource_types) - if cost <= 1: - # directly start job - return drone - try: - priorities[cost].append(drone) - except KeyError: - priorities[cost] = [drone] + for drone in self.drone_list: + cost = 0 + resource_types = {*drone.resources.keys(), *job.resources.keys()} + for resource_type in resource_types: + if resource_type not in drone.resources.keys(): + cost = float("Inf") + elif resource_type not in job.resources: + cost += drone.resources[resource_type] - drone.resources[resource_type] + elif (drone.pool_resources[resource_type] - drone.resources[resource_type]) < \ + job.resources[resource_type]: + cost = float("Inf") + break + else: + cost += (drone.pool_resources[resource_type] - drone.resources[resource_type]) // \ + job.resources[resource_type] + cost /= len(resource_types) + if cost <= 1: + # directly start job + return drone + try: + priorities[cost].append(drone) + except KeyError: + priorities[cost] = [drone] try: minimal_key = min(priorities) if minimal_key < float("Inf"): diff --git a/lapis/simulator.py b/lapis/simulator.py index 484e6be..703ed67 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -3,6 +3,10 @@ import simpy +from usim import run, time, until +from usim.basics import Queue + +from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.utility.monitor import monitor, trace @@ -11,29 +15,46 @@ class Simulator(object): def __init__(self, seed=1234): random.seed(seed) resource_normalisation = {"memory": 2000} - monitor_data = partial(monitor, resource_normalisation) - self.env = simpy.Environment() - self.job_queue = [] + # monitor_data = partial(monitor, resource_normalisation) + self.job_queue = Queue() self.pools = [] + self.controllers = [] self.job_scheduler = None + self.job_generator = None self.cost = 0 - trace(self.env, monitor_data, resource_normalisation=resource_normalisation, simulator=self) + self._job_generators = [] + # trace(self.env, monitor_data, resource_normalisation=resource_normalisation, simulator=self) def create_job_generator(self, job_input, job_reader): - job_generator = job_to_queue_scheduler(job_generator=job_reader(self.env, job_input), - job_queue=self.job_queue, - env=self.env) - 
self.env.process(job_generator) + self._job_generators.append((job_input, job_reader)) def create_pools(self, pool_input, pool_reader, pool_type, controller=None): - for pool in pool_reader(env=self.env, iterable=pool_input, pool_type=pool_type): + assert self.job_scheduler, "Scheduler needs to be created before pools" + for pool in pool_reader(iterable=pool_input, pool_type=pool_type, make_drone=partial(Drone, self.job_scheduler)): self.pools.append(pool) if controller: - controller(self.env, target=pool, rate=1) + controller(target=pool, rate=1) + self.controllers.append(controller) def create_scheduler(self, scheduler_type): - self.job_scheduler = scheduler_type(env=self.env, job_queue=self.job_queue, pools=self.pools) + self.job_scheduler = scheduler_type(job_queue=self.job_queue) def run(self, until=2000): print("running until", until) - self.env.run(until=until) + run(self._simulate(until)) + + async def _simulate(self, end): + print("Starting simulation at %s" % time.now) + async with until(time == end) as while_running: + for pool in self.pools: + while_running.do(pool.run()) + for job_input, job_reader in self._job_generators: + while_running.do(self._queue_jobs(job_input, job_reader)) + while_running.do(self.job_scheduler.run()) + for controller in self.controllers: + while_running.do(controller.run()) + print("Finished simulation at %s" % time.now) + + async def _queue_jobs(self, job_input, job_reader): + await job_to_queue_scheduler(job_generator=job_reader(job_input), + job_queue=self.job_queue) From 9f7bb3aefeab493a8f0723c2119c5b59d509c437 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Mar 2019 16:03:09 +0100 Subject: [PATCH 098/648] required changes to completely switch to usim instead of simpy --- lapis/controller.py | 3 - lapis/cost.py | 17 ++-- lapis/drone.py | 117 +++++++++++++++++--------- lapis/job.py | 30 +++---- lapis/pool.py | 4 +- lapis/scheduler.py | 27 +++--- lapis/simulator.py | 5 -- lapis/utility/monitor.py | 175 +++++++++++++++++++++++---------------- setup.py | 4 +- 9 files changed, 217 insertions(+), 165 deletions(-) diff --git a/lapis/controller.py b/lapis/controller.py index 1bbd00b..aba6fb7 100644 --- a/lapis/controller.py +++ b/lapis/controller.py @@ -11,9 +11,6 @@ def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate= async def run(self): while True: self.regulate(interval=self.interval) - # print("[controller] demand %d -> %d, supply %d (global %d), allocation %.2f, utilisation %.2f " - # "(available %d)" % (pre_demand, self.target.demand, self.target.supply, globals.global_demand.level, - # self.target.allocation, self.target.utilisation, self.target.level)) await (time + self.interval) diff --git a/lapis/cost.py b/lapis/cost.py index 72b5dd5..4518910 100644 --- a/lapis/cost.py +++ b/lapis/cost.py @@ -1,13 +1,12 @@ def cobald_cost(simulator): - result = len(simulator.job_queue) - for pool in simulator.pools: - for drone in pool.drones: - result += 1 - tmp = 0 - for resource_key in pool.resources: - tmp += drone.resources[resource_key] / pool.resources[resource_key] - tmp /= len(pool.resources) - result -= tmp + result = len(simulator.job_scheduler.drone_list) + for drone in simulator.job_scheduler.drone_list: + result += 1 + tmp = 0 + for resource_key in drone.pool_resources: + tmp += drone.resources[resource_key] / drone.pool_resources[resource_key] + tmp /= len(drone.pool_resources) + result -= tmp return result diff --git a/lapis/drone.py b/lapis/drone.py index e7a9ca8..91ccf9f 100644 --- a/lapis/drone.py +++ 
b/lapis/drone.py @@ -1,5 +1,13 @@ +import logging + from cobald import interfaces -from usim import time, Scope +from usim import time, Scope, ActivityCancelled, instant, ActivityState + +from lapis.job import Job + + +class ResourcesExceeded(Exception): + ... class Drone(interfaces.Pool): @@ -8,9 +16,9 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float): self.scheduler = scheduler self.pool_resources = pool_resources self.resources = {resource: 0 for resource in self.pool_resources} - self.scheduling_duration = scheduling_duration # shadowing requested resources to determine jobs to be killed self.used_resources = {resource: 0 for resource in self.pool_resources} + self.scheduling_duration = scheduling_duration if scheduling_duration == 0: self._supply = 1 self.scheduler.register_drone(self) @@ -21,30 +29,32 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float): self._utilisation = None async def run(self): + from lapis.utility.monitor import sampling_required await (time + self.scheduling_duration) self._supply = 1 self.scheduler.register_drone(self) + await sampling_required.set(True) @property - def supply(self): + def supply(self) -> float: return self._supply @property - def demand(self): + def demand(self) -> float: return 1 @demand.setter - def demand(self, value): + def demand(self, value: float): pass # demand is always defined as 1 @property - def utilisation(self): + def utilisation(self) -> float: if self._utilisation is None: self._init_allocation_and_utilisation() return self._utilisation @property - def allocation(self): + def allocation(self) -> float: if self._allocation is None: self._init_allocation_and_utilisation() return self._allocation @@ -57,12 +67,35 @@ def _init_allocation_and_utilisation(self): self._utilisation = min(resources) async def shutdown(self): + from lapis.utility.monitor import sampling_required self._supply = 0 self.scheduler.unregister_drone(self) + await sampling_required.set(True) await (time + 1) # print("[drone %s] has been shut down" % self) - async def start_job(self, job, kill=False): + def _add_resources(self, keys: list, target: dict, source: dict, alternative_source: dict): + resources_exceeded = False + for resource_key in keys: + try: + value = target[resource_key] + source[resource_key] + except KeyError: + value = target[resource_key] + alternative_source[resource_key] + if value > self.pool_resources[resource_key]: + resources_exceeded = True + target[resource_key] = value + if resources_exceeded: + raise ResourcesExceeded() + + @staticmethod + def _remove_resources(keys: list, target: dict, source: dict, alternative_source: dict): + for resource_key in keys: + try: + target[resource_key] -= source[resource_key] + except KeyError: + target[resource_key] -= alternative_source[resource_key] + + async def start_job(self, job: Job, kill: bool=False): """ Method manages to start a job in the context of the given drone. The job is started independent of available resources. If resources of drone are exceeded, the job is killed. 
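
To make the bookkeeping above concrete, the accounting rule of _add_resources can be rephrased against plain dicts; the numbers below are invented, and the drone itself raises ResourcesExceeded instead of returning a flag.

def add_resources(keys, target, source, alternative_source, pool_resources):
    # prefer the measured value from `source`, fall back to the requested one
    exceeded = False
    for key in keys:
        try:
            value = target[key] + source[key]
        except KeyError:
            value = target[key] + alternative_source[key]
        if value > pool_resources[key]:
            exceeded = True
        target[key] = value
    return exceeded


used = {"cores": 0, "memory": 0}
job_used = {"cores": 1}                       # no memory measurement available
job_requested = {"cores": 1, "memory": 2000}  # fallback values
print(add_resources({"cores", "memory"}, used, job_used, job_requested,
                    pool_resources={"cores": 4, "memory": 8000}))
# False -- nothing exceeded; `used` is now {"cores": 1, "memory": 2000}
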
@@ -71,46 +104,50 @@ async def start_job(self, job, kill=False): :param kill: if True, a job is killed when used resources exceed requested resources :return: """ + # TODO: ensure that jobs cannot immediately started on the same drone until the jobs did not allocate resources async with Scope() as scope: - self._utilisation = None - self._allocation = None - self.jobs += 1 + from lapis.utility.monitor import sampling_required + self._utilisation = self._allocation = None + job_execution = scope.do(job.run()) - # TODO: needs to be killed if resources are exceeding - for resource_key in job.resources: - try: - if self.used_resources[resource_key] + job.used_resources[resource_key] > self.pool_resources[resource_key]: - job.kill() - except KeyError: - # we do not have data about how many resources the job used, so check with requested data - if self.used_resources[resource_key] + job.resources[resource_key] > self.pool_resources[resource_key]: - job.kill() + job_keys = {*job.resources, *job.used_resources} + + try: + self._add_resources(job_keys, self.used_resources, job.used_resources, job.resources) + except ResourcesExceeded: + job_execution.cancel() + try: + # TODO: should we really kill the job if it is only about resources and not used resources? + self._add_resources(job_keys, self.resources, job.resources, job.used_resources) + except ResourcesExceeded: + job_execution.cancel() + + for resource_key in job_keys: try: if job.resources[resource_key] < job.used_resources[resource_key]: if kill: - job.kill() + job_execution.cancel() else: pass except KeyError: # check is not relevant if the data is not stored pass - for resource_key in job.resources: - self.resources[resource_key] += job.resources[resource_key] - for resource_key in {*job.resources, *job.used_resources}: - try: - self.used_resources[resource_key] += job.used_resources[resource_key] - except KeyError: - self.used_resources[resource_key] += job.resources[resource_key] + await instant # waiting just a moment to enable job to set parameters + if job_execution.status != ActivityState.CANCELLED: + self.jobs += 1 + await sampling_required.set(True) await job_execution - self.jobs -= 1 - self._utilisation = None - self._allocation = None - for resource_key in job.resources: - self.resources[resource_key] -= job.resources[resource_key] - for resource_key in {*job.resources, *job.used_resources}: - try: - self.used_resources[resource_key] -= job.used_resources[resource_key] - except KeyError: - self.used_resources[resource_key] -= job.resources[resource_key] - # put drone back into pool queue - # print("[drone %s] finished job at %d" % (self, self.env.now)) + if job_execution.status == ActivityState.CANCELLED: + for resource_key in job_keys: + usage = job.used_resources.get(resource_key, None) or job.resources.get(resource_key, None) + value = usage / (job.resources.get(resource_key, None) or self.pool_resources[resource_key]) + if value > 1: + logging.info(str(round(time.now)), {"job_exceeds_%s" % resource_key: value}) + else: + self.jobs -= 1 + self._remove_resources(job_keys, self.resources, job.resources, job.used_resources) + self._remove_resources(job_keys, self.used_resources, job.used_resources, job.resources) + self._utilisation = self._allocation = None + await sampling_required.set(True) + + diff --git a/lapis/job.py b/lapis/job.py index 698f109..9b8fac2 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -1,9 +1,12 @@ import random import math -import simpy import logging -from usim import time +from usim import time, 
ActivityCancelled + + +class JobKilled(Exception): + ... # TODO: needs refactoring @@ -58,23 +61,14 @@ def waiting_time(self): async def run(self): self.in_queue_until = time.now - # self.processing = self.env.process(self._process()) - # return self.processing + logging.info(str(round(time.now)), { + "job_queue_time": self.queue_date, + "job_waiting_time": self.waiting_time + }) await (time + self.walltime or self.requested_walltime) - print("%s: job finished after %s" % (time.now, self.walltime or self.requested_walltime)) - - # TODO: not needed anymore? - def _process(self): - try: - yield self.env.timeout(0, value=self) - yield self.env.timeout(self.requested_walltime or self.walltime) - except simpy.exceptions.Interrupt: - pass - - # TODO: interrupt should be integrated - def kill(self): - # job exceeds either own requested resources or resources provided by drone - self.processing.interrupt(cause=self) + logging.info(str(round(time.now)), { + "job_wall_time": self.walltime or self.requested_walltime + }) def job_property_generator(**kwargs): diff --git a/lapis/pool.py b/lapis/pool.py index 04959b7..97ade86 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -16,7 +16,7 @@ class Pool(interfaces.Pool): def __init__(self, capacity=float('inf'), init=0, name=None, make_drone: Callable=None): super(Pool, self).__init__() assert make_drone - self.make_drone= make_drone + self.make_drone = make_drone self._drones = [] self.init_pool(init=init) self._demand = 1 @@ -60,7 +60,7 @@ async def run(self): self._drones.append(drone) self.put(1) if self.level > self._demand: - for drone in self._drones: + for drone in self.drones: # only consider drones, that supply resources if drone.jobs == 0: break else: diff --git a/lapis/scheduler.py b/lapis/scheduler.py index d0bb254..59280d3 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,5 +1,6 @@ -from usim import time, Scope +from collections import deque +from usim import time, Scope, each # TODO: does not work anymore as there is no method get_drone at pool from lapis.drone import Drone @@ -29,8 +30,10 @@ class CondorJobScheduler(object): :return: """ def __init__(self, job_queue): - self.job_queue = job_queue + self._stream_queue = job_queue self.drone_list = [] + self.interval = 60 + self.job_queue = [] def register_drone(self, drone: Drone): self.drone_list.append(drone) @@ -39,22 +42,18 @@ def unregister_drone(self, drone: Drone): self.drone_list.remove(drone) async def run(self): - # current_job = None - # postponed_unmatched_job = False async with Scope() as scope: - temp = [] - while True: - async for job in self.job_queue: + scope.do(self._collect_jobs()) + async for _ in each(interval=self.interval): + for job in self.job_queue: best_match = self._schedule_job(job) if best_match: scope.do(best_match.start_job(job)) - else: - temp.append(job) - # put all the jobs that could not be scheduled back into the queue - while temp: - job = temp.pop() - await self.job_queue.put(job) - await (time + 60) + self.job_queue.remove(job) + + async def _collect_jobs(self): + async for job in self._stream_queue: + self.job_queue.append(job) def _schedule_job(self, job) -> Drone: priorities = {} diff --git a/lapis/simulator.py b/lapis/simulator.py index 703ed67..4057817 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -1,21 +1,17 @@ import random from functools import partial -import simpy - from usim import run, time, until from usim.basics import Queue from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from 
lapis.utility.monitor import monitor, trace class Simulator(object): def __init__(self, seed=1234): random.seed(seed) resource_normalisation = {"memory": 2000} - # monitor_data = partial(monitor, resource_normalisation) self.job_queue = Queue() self.pools = [] self.controllers = [] @@ -23,7 +19,6 @@ def __init__(self, seed=1234): self.job_generator = None self.cost = 0 self._job_generators = [] - # trace(self.env, monitor_data, resource_normalisation=resource_normalisation, simulator=self) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 0340174..8a012f1 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -1,78 +1,107 @@ from functools import wraps +from typing import Callable -import simpy import logging +from usim import each, Flag, time + from lapis.cost import cobald_cost -from lapis.job import Job - -last_step = 0 - - -def trace(env, callback, resource_normalisation, simulator): - def get_wrapper(env_step, callback): - @wraps(env_step) - def tracing_step(): - if len(env._queue): - t, prio, eid, event = env._queue[0] - callback(t, prio, eid, event, resource_normalisation, simulator) - return env_step() - return tracing_step - env.step = get_wrapper(env.step, callback) - - -def monitor(data, t, prio, eid, event, resource_normalisation, simulator): - if event.value: - if isinstance(event.value, simpy.exceptions.Interrupt): - job = event.value.cause - for resource_key, usage in job.used_resources.items(): - value = usage / job.resources[resource_key] - if value > 1: - logging.info(str(round(t)), {"job_exceeds_%s" % resource_key: value}) - if isinstance(event.value, Job): - logging.info(str(round(t)), {"job_waiting_times": event.value.waiting_time}) - global last_step - if t > last_step: - # new data to be recorded - tmp = round(t) - last_step = tmp - pool_demand = 0 - pool_supply = 0 - pool_utilisation = 0 - pool_allocation = 0 - running_jobs = 0 - used_resources = 0 - unused_resources = 0 - available_resources = 0 - empty_drones = 0 - result = {} - for pool in simulator.pools: - pool_demand += pool.demand - pool_supply += pool.supply - result["pool_%s_supply" % pool.name] = pool.supply - pool_utilisation += pool.utilisation - pool_allocation += pool.allocation - for drone in pool.drones: - running_jobs += drone.jobs - if drone.allocation == 0: - empty_drones += 1 - for resource_key, usage in drone.resources.items(): - normalisation_factor = resource_normalisation.get(resource_key, 1) - used_resources += usage / normalisation_factor - unused_resources += (pool.resources[resource_key] - usage) / normalisation_factor - available_resources += pool.resources[resource_key] / normalisation_factor - result["user_demand"] = len(simulator.job_queue) - result["pool_demand"] = pool_demand - result["pool_supply"] = pool_supply - result["pool_utilisation"] = pool_utilisation - result["pool_allocation"] = pool_allocation - result["running_jobs"] = running_jobs - result["empty_drones"] = empty_drones - result["used_resources"] = used_resources - result["unused_resources"] = unused_resources - result["available_resources"] = available_resources - current_cost = cobald_cost(simulator) - result["cost"] = current_cost - simulator.cost += current_cost - result["acc_cost"] = simulator.cost - logging.info(str(tmp), result) +from lapis.simulator import Simulator + +sampling_required = Flag() + + +class Monitoring(object): + # TODO: we need to check how to 
integrate the normalization factor + def __init__(self, simulator: Simulator): + self.simulator = simulator + self._statistics = [] + + async def run(self): + async for _ in each(delay=1): + await sampling_required + await sampling_required.set(False) + result = {} + for statistic in self._statistics: + # do the logging + result.update(statistic(self.simulator)) + logging.info(str(round(time.now)), result) + + def register_statistic(self, statistic: Callable): + self._statistics.append(statistic) + + +def collect_resource_statistics(simulator: Simulator) -> dict: + empty_drones = 0 + drone_resources = {} + for drone in simulator.job_scheduler.drone_list: + if drone.allocation == 0: + empty_drones += 1 + for resource_key in {*drone.resources, *drone.used_resources}: + drone_resources.setdefault(resource_key, {}) + try: + drone_resources[resource_key]["reserved"] += drone.resources[resource_key] + except KeyError: + drone_resources[resource_key]["reserved"] = drone.resources[resource_key] + try: + drone_resources[resource_key]["used"] += drone.used_resources[resource_key] + except KeyError: + drone_resources[resource_key]["used"] = drone.used_resources[resource_key] + try: + drone_resources[resource_key]["available"] += drone.pool_resources[resource_key] - drone.resources[resource_key] + except KeyError: + drone_resources[resource_key]["available"] = drone.pool_resources[resource_key] - drone.resources[resource_key] + try: + drone_resources[resource_key]["total"] += drone.pool_resources[resource_key] + except KeyError: + drone_resources[resource_key]["total"] = drone.pool_resources[resource_key] + return { + "empty_drones": empty_drones, + "drone_resources": drone_resources + } + + +def collect_cobald_cost(simulator: Simulator) -> dict: + current_cost = cobald_cost(simulator) + simulator.cost += current_cost + return { + "cobald_cost": { + "current": current_cost, + "accumulated": simulator.cost + } + } + + +def collect_user_demand(simulator: Simulator) -> dict: + return { + "user_demand": len(simulator.job_scheduler.job_queue) + } + + +def collect_job_statistics(simulator: Simulator) -> dict: + result = 0 + for drone in simulator.job_scheduler.drone_list: + result += drone.jobs + return { + "running_jobs": result + } + + +def collect_pool_statistics(simulator: Simulator) -> dict: + pool_demand = {} + pool_supply = {} + pool_utilisation = {} + pool_allocation = {} + for pool in simulator.pools: + pool_demand[id(pool)] = pool.demand + pool_supply[id(pool)] = pool.supply + pool_utilisation[id(pool)] = pool.utilisation + pool_allocation[id(pool)] = pool.allocation + return { + "pool": { + "demand": pool_demand, + "supply": pool_supply, + "allocation": pool_allocation, + "utilisation": pool_utilisation + } + } diff --git a/setup.py b/setup.py index f1f20b7..79e7331 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ # dependencies install_requires=[ 'cobald', - 'simpy', + 'usim', + 'click' ], extras_require={ # 'docs': ["sphinx", "sphinxcontrib-tikz", "sphinx_rtd_theme"], @@ -46,6 +47,7 @@ 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7' ], keywords=package_about['__keywords__'], # unit tests From 472dd6d50600b84e92ffc0b1ee9f26bc6b67e39d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Mar 2019 16:15:35 +0100 Subject: [PATCH 099/648] checking for conversion of fields for htcondor job import --- lapis/job_io/htcondor.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 
deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index a21f6d8..3a061db 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -22,10 +22,15 @@ def htcondor_job_reader(iterable, resource_name_mapping={ if float(row[used_resource_name_mapping["walltime"]]) <= 0: logging.getLogger("implementation").warning("removed job from htcondor import", row) continue + resources = {} + for key in resource_name_mapping: + try: + resources[key] = float(row[resource_name_mapping[key]]) + except ValueError: + pass yield Job( - resources={ - key: float(row[resource_name_mapping[key]]) for key in resource_name_mapping - }, used_resources={ + resources=resources, + used_resources={ "cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) / float(row[used_resource_name_mapping["walltime"]]), "memory": float(row[used_resource_name_mapping["memory"]]), From 35937fb1fddf3a04cf1910aac93c629d6bbddf21 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Mar 2019 16:18:56 +0100 Subject: [PATCH 100/648] adapted tests to new class definition --- lapis_tests/job_io/test_htcondor.py | 2 +- lapis_tests/job_io/test_swf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index 5911554..fd533e7 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -7,7 +7,7 @@ class TestHtcondorJobReader(object): def test_simple_read(self): with open(os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv")) as input_file: jobs = 0 - for job in htcondor_job_reader(None, input_file): + for job in htcondor_job_reader(input_file): assert job is not None jobs += 1 assert jobs > 0 diff --git a/lapis_tests/job_io/test_swf.py b/lapis_tests/job_io/test_swf.py index 3395af1..3bb861f 100644 --- a/lapis_tests/job_io/test_swf.py +++ b/lapis_tests/job_io/test_swf.py @@ -6,7 +6,7 @@ class TestSwfJobReader(object): def test_simple_read(self): with open(os.path.join(os.path.dirname(__file__), "..", "data", "swf_jobs.swf")) as input_file: job_count = 0 - for job in swf_job_reader(None, input_file): + for job in swf_job_reader(input_file): assert job is not None job_count += 1 assert job_count > 0 From 863e1e7a78ab9f0de273950aeaa9f7024e2dfdab Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Mon, 25 Mar 2019 16:56:46 +0100 Subject: [PATCH 101/648] Update setup.py with usim git URL --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 79e7331..d4afb84 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,9 @@ extras_require={ # 'docs': ["sphinx", "sphinxcontrib-tikz", "sphinx_rtd_theme"], }, + dependency_links=[ + 'git+https://github.com/MaineKuehn/usim.git#egg=usim-0.0.1', + ] # metadata for package search license='MIT', # https://pypi.python.org/pypi?%3Aaction=list_classifiers From 7e550143a335604458fb7c158a34e800fd66a85e Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Mon, 25 Mar 2019 17:47:06 +0100 Subject: [PATCH 102/648] Update setup.py Now that was embarrassing... 
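The guard added to the HTCondor job import above simply skips requested-resource columns whose raw value does not parse as a number, instead of dropping the whole job. A standalone sketch of that pattern (the row contents are made up and only two columns of the mapping are shown):

resource_name_mapping = {"cores": "RequestCpus", "memory": "RequestMemory"}
row = {"RequestCpus": "4", "RequestMemory": "undefined"}  # one unparseable value

resources = {}
for key, column in resource_name_mapping.items():
    try:
        resources[key] = float(row[column])
    except ValueError:
        # non-numeric entries are dropped; the job is still imported
        pass

print(resources)  # {'cores': 4.0}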
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d4afb84..06c75ee 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ }, dependency_links=[ 'git+https://github.com/MaineKuehn/usim.git#egg=usim-0.0.1', - ] + ], # metadata for package search license='MIT', # https://pypi.python.org/pypi?%3Aaction=list_classifiers From c0e37a79b0cdf4e4bc7d16d543cb7d19718abbc4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Mar 2019 16:24:00 +0100 Subject: [PATCH 103/648] removed unnecessary parameters --- lapis/job.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index 9b8fac2..9bfb703 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -41,6 +41,9 @@ def job_demand(simulator): # TODO: needs refactoring class Job(object): + __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", + "in_queue_until", "name") + def __init__(self, resources, used_resources=None, in_queue_since=0, queue_date=0, name=None): self.resources = resources self.used_resources = used_resources @@ -50,7 +53,6 @@ def __init__(self, resources, used_resources=None, in_queue_since=0, queue_date= self.queue_date = queue_date self.in_queue_since = in_queue_since self.in_queue_until = None - self.processing = None self.name = name or id(self) @property From 2d8aadd98ebfe6d0bd9ba8a2d6ed85ee42233c0e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Mar 2019 16:40:11 +0100 Subject: [PATCH 104/648] added documentation for the description of a job --- lapis/job.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 9bfb703..93e4b1a 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -2,11 +2,7 @@ import math import logging -from usim import time, ActivityCancelled - - -class JobKilled(Exception): - ... +from usim import time # TODO: needs refactoring @@ -39,12 +35,23 @@ def job_demand(simulator): # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) -# TODO: needs refactoring class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", "in_queue_until", "name") - def __init__(self, resources, used_resources=None, in_queue_since=0, queue_date=0, name=None): + def __init__(self, resources: dict, used_resources: dict, in_queue_since: float=0, queue_date: float=0, + name: str=None): + """ + Definition of a job that uses a specified amount of resources `used_resources` over a given amount of time, + `walltime`. A job is described by its user via the parameter `resources`. This is a user prediction and is + expected to deviate from `used_resources`. + + :param resources: Requested resources of the job + :param used_resources: Resource usage of the job + :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler + :param queue_date: Time when job was inserted into queue in real life + :param name: Name of the job + """ self.resources = resources self.used_resources = used_resources self.walltime = used_resources.pop("walltime", None) @@ -56,7 +63,12 @@ def __init__(self, resources, used_resources=None, in_queue_since=0, queue_date= self.queue_date = queue_date self.in_queue_since = in_queue_since self.in_queue_until = None self.name = name or id(self) @property - def waiting_time(self): + def waiting_time(self) -> float: + """ + The time the job spent in the simulator's scheduling queue. `Inf` when the job is still waiting. 
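As a usage sketch of the constructor documented above (all numbers are invented; note that `walltime` is popped from `used_resources` by the constructor itself):

from lapis.job import Job

job = Job(
    resources={"cores": 8, "memory": 16, "walltime": 3600},         # user request / prediction
    used_resources={"cores": 2.5, "memory": 12, "walltime": 2700},  # what the job actually consumed
    queue_date=1554768000,  # submission time in the original workload
    in_queue_since=0,       # time of insertion into the simulation scheduler queue
)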
+ + :return: Time in queue + """ if self.in_queue_until is not None: return self.in_queue_until - self.in_queue_since return float("Inf") From c546101d74fe511848f0b2945f4e52f1e2277615 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Mar 2019 15:56:28 +0100 Subject: [PATCH 105/648] adapted gitignore --- .gitignore | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 220 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0255a5d..666c4a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,225 @@ +# Created by .ignore support plugin (hsz.mobi) +### Windows template +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ +cmake-build-release/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties +### Python template +# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/_static/ +docs/_templates/ + +# PyBuilder +target/ + +# Jupyter Notebook .ipynb_checkpoints -.idea + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### Linux template +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* +### macOS template +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk From 83e9e268248063e308bd222709756b61fd0f1541 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Mar 2019 16:15:10 +0100 Subject: [PATCH 106/648] added skeleton for documentation --- .readthedocs.yml | 5 ++ docs/Makefile | 19 +++++ docs/conf.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 20 +++++ docs/make.bat | 35 +++++++++ setup.py | 2 +- 6 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 .readthedocs.yml create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..b752bb3 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,5 @@ +python: + version: 3 + pip_install: true + extra_requirements: + - docs diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..298ea9e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..a6437ab --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. 
+# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import os +import sys +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from usim import __about__ + + +# -- Project information ----------------------------------------------------- + +project = __about__.__title__ +copyright = __about__.__copyright__ +author = __about__.__author__ + +# The short X.Y version +version = __about__.__version__ +# The full version, including alpha/beta/rc tags +release = version + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.imgmath', + 'sphinx.ext.viewcode', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. 
+# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'lapisdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'lapis.tex', 'lapis Documentation', + 'Eileen Kuehn, Max Fischer', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'lapis', 'lapis Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'lapis', 'lapis Documentation', + author, 'lapis', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + "python": ('https://docs.python.org/3', None), + "usim": ('https://usim.readthedocs.io/en/stable', None), +} + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..3b0e9d5 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +.. lapis documentation master file, created by + sphinx-quickstart on Tue Mar 26 16:10:43 2019. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to lapis's documentation! +================================= + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..27f573b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. 
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/setup.py b/setup.py index 06c75ee..d20a4be 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ 'click' ], extras_require={ - # 'docs': ["sphinx", "sphinxcontrib-tikz", "sphinx_rtd_theme"], + 'docs': ["sphinx", "sphinxcontrib-tikz"], }, dependency_links=[ 'git+https://github.com/MaineKuehn/usim.git#egg=usim-0.0.1', From 26bfde103a57948d6481d1de0ebbedf5897d6072 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Mar 2019 16:28:20 +0100 Subject: [PATCH 107/648] adapted version of usim --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d20a4be..76af12f 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ 'docs': ["sphinx", "sphinxcontrib-tikz"], }, dependency_links=[ - 'git+https://github.com/MaineKuehn/usim.git#egg=usim-0.0.1', + 'git+https://github.com/MaineKuehn/usim.git#egg=usim-0.1.0', ], # metadata for package search license='MIT', From 56f33b8c47f67f3aa45ac730b30760be84b3ea8f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Mar 2019 16:42:19 +0100 Subject: [PATCH 108/648] module index for documentation --- docs/conf.py | 16 +++++++++++++++- lapis/__about__.py | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index a6437ab..931229a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,7 +18,7 @@ import os import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from usim import __about__ +from lapis import __about__ # -- Project information ----------------------------------------------------- @@ -196,3 +196,17 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. 
todo_include_todos = True + +# -- Custom Extensions ------------------------------------------------------- + + +def run_apidoc(_): + """Run the `apidoc` tool to generate `autodoc` documentation for all modules""" + from sphinx.apidoc import main + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'source', 'api')) + source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', __about__.__title__)) + main(['--module-first', '--separate', '--output-dir=' + output_dir, source_dir, '--force']) + + +def setup(app): + app.connect('builder-inited', run_apidoc) diff --git a/lapis/__about__.py b/lapis/__about__.py index 5c57ae8..931ce92 100644 --- a/lapis/__about__.py +++ b/lapis/__about__.py @@ -12,5 +12,5 @@ __version__ = '0.1.0' __author__ = 'Eileen Kuehn, Max Fischer' __email__ = 'mainekuehn@gmail.com' -__copyright__ = '2018 %s' % __author__ +__copyright__ = '2019 %s' % __author__ __keywords__ = 'opportunistic scheduling scheduler cobald simulator' From 4452b1f2aaeebb8d54bd96c9ce94c5fb9f5e38be Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 16:51:22 +0100 Subject: [PATCH 109/648] Update setup.py git egg fragment --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 76af12f..9147dc8 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ 'docs': ["sphinx", "sphinxcontrib-tikz"], }, dependency_links=[ - 'git+https://github.com/MaineKuehn/usim.git#egg=usim-0.1.0', + 'git+https://github.com/MaineKuehn/usim.git#egg=usim', ], # metadata for package search license='MIT', From 2a93360a622825aa71dd6ec66d5d2bd46ca5c988 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 16:54:01 +0100 Subject: [PATCH 110/648] Update RTD config to v2 --- .readthedocs.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index b752bb3..076647a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,5 +1,13 @@ +version: 2 + python: version: 3 - pip_install: true - extra_requirements: - - docs + install: + - method: pip + path: . + extra_requirements: + - docs + +sphinx: + builder: html + configuration: docs/conf.py From b72318b4468659817d22e62ea7f143d13fc0eac1 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:00:26 +0100 Subject: [PATCH 111/648] explicit usim path --- .readthedocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 076647a..c3fc256 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -3,6 +3,8 @@ version: 2 python: version: 3 install: + - method: pip + path: "git+https://github.com/MaineKuehn/usim.git#egg=usim" - method: pip path: . 
extra_requirements: From b183b0be32bbacebd5f49b0ac2c2bcc66a6cdd76 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:02:57 +0100 Subject: [PATCH 112/648] Create requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8924c1d --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/MaineKuehn/usim.git@master#egg=usim From 682af1067adfd24ea78f77f5343c7f94c60ac118 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:03:10 +0100 Subject: [PATCH 113/648] Update .readthedocs.yml --- .readthedocs.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index c3fc256..076647a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -3,8 +3,6 @@ version: 2 python: version: 3 install: - - method: pip - path: "git+https://github.com/MaineKuehn/usim.git#egg=usim" - method: pip path: . extra_requirements: From 7bf5c62fe8d76a0b6ee520e2b39c6bb5b3106c5f Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:03:54 +0100 Subject: [PATCH 114/648] Update .readthedocs.yml --- .readthedocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 076647a..1d29212 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -3,6 +3,7 @@ version: 2 python: version: 3 install: + - requirements: requirements.txt - method: pip path: . extra_requirements: From 805650990f5fffa4f6daea9d8d8dc2aa769bd9b5 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:26:51 +0100 Subject: [PATCH 115/648] apparently, pip understands PEP 508 now... --- .readthedocs.yml | 1 - requirements.txt | 1 - setup.py | 5 +---- 3 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 requirements.txt diff --git a/.readthedocs.yml b/.readthedocs.yml index 1d29212..076647a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -3,7 +3,6 @@ version: 2 python: version: 3 install: - - requirements: requirements.txt - method: pip path: . extra_requirements: diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8924c1d..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/MaineKuehn/usim.git@master#egg=usim diff --git a/setup.py b/setup.py index 9147dc8..c7758b0 100644 --- a/setup.py +++ b/setup.py @@ -25,15 +25,12 @@ # dependencies install_requires=[ 'cobald', - 'usim', + 'usim@git+https://github.com/MaineKuehn/usim.git@master#egg=usim-0.1.0', 'click' ], extras_require={ 'docs': ["sphinx", "sphinxcontrib-tikz"], }, - dependency_links=[ - 'git+https://github.com/MaineKuehn/usim.git#egg=usim', - ], # metadata for package search license='MIT', # https://pypi.python.org/pypi?%3Aaction=list_classifiers From 72d1a99ca6a01cb32a97678c2c6ada822f54c91c Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:31:31 +0100 Subject: [PATCH 116/648] ...but not pip on travis... 
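For context on the setup.py change above: recent pip releases understand PEP 508 direct references inside install_requires, which is what makes the separate dependency_links and requirements.txt workarounds unnecessary. A minimal sketch (the package name and version are placeholders; the usim URL is the one from the patch):

from setuptools import setup

setup(
    name="example-package",  # placeholder
    version="0.0.1",
    install_requires=[
        # PEP 508 direct reference: "<name> @ <url>"
        "usim @ git+https://github.com/MaineKuehn/usim.git@master#egg=usim",
    ],
)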
--- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3da6e92..7ac663c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ matrix: allow_failures: - python: "3.7-dev" before_script: + - pip install pip --upgrade - export PYTHONHASHSEED=${PYTHONHASHSEED:-${RANDOM}} - echo "export PYTHONHASHSEED=${PYTHONHASHSEED}" - pip install codecov @@ -19,4 +20,4 @@ before_script: script: - coverage run setup.py test after_success: - - coverage report && codecov \ No newline at end of file + - coverage report && codecov From 5f2a2f1665ce679a56aaea330645e3e95dd95d36 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Mar 2019 17:37:19 +0100 Subject: [PATCH 117/648] ...and not with a simple fix either... --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7ac663c..d0c38bc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,8 +13,10 @@ matrix: - python: "3.7-dev" before_script: - pip install pip --upgrade + - pip --version - export PYTHONHASHSEED=${PYTHONHASHSEED:-${RANDOM}} - echo "export PYTHONHASHSEED=${PYTHONHASHSEED}" + - pip install . - pip install codecov - export COVERAGE_PROCESS_START=$(pwd)/.coveragerc script: From 5c4977ee0d091084329250d87cdc0501ca902161 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 14:13:16 +0200 Subject: [PATCH 118/648] corrected properties of pools --- lapis/job.py | 8 ++++++-- lapis/pool.py | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 93e4b1a..2c49eaf 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -37,7 +37,7 @@ def job_demand(simulator): class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", - "in_queue_until", "name") + "in_queue_until", "_name") def __init__(self, resources: dict, used_resources: dict, in_queue_since: float=0, queue_date: float=0, name: str=None): @@ -60,7 +60,11 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float= self.queue_date = queue_date self.in_queue_since = in_queue_since self.in_queue_until = None - self.name = name or id(self) + self._name = name + + @property + def name(self) -> str: + return self._name or id(self) @property def waiting_time(self) -> float: diff --git a/lapis/pool.py b/lapis/pool.py index 97ade86..8472d5c 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -8,33 +8,33 @@ class Pool(interfaces.Pool): A pool encapsulating a number of pools or drones. Given a specific demand, allocation and utilisation, the pool is able to adapt in terms of number of drones providing the given resources. 
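A construction sketch for the pool described above (the drone factory is a stand-in; in the simulation, `make_drone` is a `functools.partial` that builds a `Drone` bound to a scheduler and the pool's resources, as the pool_io readers do):

from functools import partial

from lapis.pool import Pool

def make_example_drone(pool_resources):
    # stand-in factory; a real one returns Drone(scheduler, pool_resources, scheduling_duration)
    return None

pool = Pool(
    capacity=10,  # at most ten drones may exist in this pool
    init=2,       # two drones are created right away
    name="example-pool",
    make_drone=partial(make_example_drone, {"cores": 8, "memory": 32}),
)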
- :param env: Reference to the simulation env :param capacity: Maximum number of pools that can be instantiated within the pool :param init: Number of pools to instantiate at creation time of the pool - :param resources: Dictionary of resources available for each pool instantiated within the pool + :param name: Name of the pool + :param make_drone: Callable to create a drone with specific properties for this pool """ - def __init__(self, capacity=float('inf'), init=0, name=None, make_drone: Callable=None): + def __init__(self, capacity: float=float('inf'), init: float=0, name: str=None, make_drone: Callable=None): super(Pool, self).__init__() assert make_drone self.make_drone = make_drone self._drones = [] self.init_pool(init=init) self._demand = 1 - self.name = name or id(self) self.level = init self._capacity = capacity + self._name = name - def put(self, amount): + def put(self, amount: float): if self.level + amount > self._capacity: raise ValueError self.level += amount - def get(self, amount): + def get(self, amount: float): if self.level - amount < 0: raise ValueError self.level -= amount - def init_pool(self, init=0): + def init_pool(self, init: float=0): """ Initialisation of existing drones at creation time of pool. @@ -102,18 +102,18 @@ def utilisation(self) -> float: return 1 @property - def supply(self) -> int: + def supply(self) -> float: supply = 0 for drone in self._drones: supply += drone.supply return supply @property - def demand(self) -> int: + def demand(self) -> float: return self._demand @demand.setter - def demand(self, value: int): + def demand(self, value: float): if value > 0: self._demand = value else: @@ -128,7 +128,7 @@ class StaticPool(Pool): :param capacity: Maximum number of pools that can be instantiated within the pool :param resources: Dictionary of resources available for each pool instantiated within the pool """ - def __init__(self, capacity=0, make_drone: Callable=None): + def __init__(self, capacity: float=0, make_drone: Callable=None): assert capacity > 0, "Static pool was initialised without any resources..." 
super(StaticPool, self).__init__(capacity=capacity, init=capacity, make_drone=make_drone) self._demand = capacity From 68a5065f3108d9f70ff2442d928ab7327e7344d1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 14:13:55 +0200 Subject: [PATCH 119/648] added representation for pools and jobs --- lapis/drone.py | 3 +++ lapis/job.py | 3 +++ lapis/pool.py | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index 91ccf9f..72732b4 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -150,4 +150,7 @@ async def start_job(self, job: Job, kill: bool=False): self._utilisation = self._allocation = None await sampling_required.set(True) + def __repr__(self): + return '<%s: %s>' % (self.__class__.__name__, id(self)) + diff --git a/lapis/job.py b/lapis/job.py index 2c49eaf..b6eaec9 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -88,6 +88,9 @@ async def run(self): "job_wall_time": self.walltime or self.requested_walltime }) + def __repr__(self): + return '<%s: %s>' % (self.__class__.__name__, self._name or id(self)) + def job_property_generator(**kwargs): while True: diff --git a/lapis/pool.py b/lapis/pool.py index 8472d5c..e0cee8d 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -119,6 +119,10 @@ def demand(self, value: float): else: self._demand = 0 + def __repr__(self): + return '<%s: %s>' % ( + self.__class__.__name__, self._name or id(self)) + class StaticPool(Pool): """ From 28f35e9c55215ef0e7450beef80f7ac7e6b246d9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 14:14:34 +0200 Subject: [PATCH 120/648] added exclusive parameter for drone --- lapis/drone.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index 72732b4..560ee4d 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -11,7 +11,13 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): - def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float): + def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, exclusive: bool=False): + """ + :param scheduler: + :param pool_resources: + :param scheduling_duration: + :param exclusive: Determines if the drone is used exclusively by jobs in sequential order + """ super(Drone, self).__init__() self.scheduler = scheduler self.pool_resources = pool_resources @@ -24,6 +30,7 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float): self.scheduler.register_drone(self) else: self._supply = 0 + self.exclusive = exclusive self.jobs = 0 self._allocation = None self._utilisation = None From 442cb15681cb7039765212b038292fb2f9ae425a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 14:34:09 +0200 Subject: [PATCH 121/648] added simple test for htcondor pool and job --- lapis/pool_io/htcondor.py | 2 +- lapis_tests/data/htcondor_pools.csv | 5 +++++ lapis_tests/pool_io/__init__.py | 0 lapis_tests/pool_io/test_htcondor.py | 24 ++++++++++++++++++++++++ lapis_tests/test_job.py | 17 +++++++++++++++++ 5 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 lapis_tests/data/htcondor_pools.csv create mode 100644 lapis_tests/pool_io/__init__.py create mode 100644 lapis_tests/pool_io/test_htcondor.py create mode 100644 lapis_tests/test_job.py diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 81432a2..4b298d9 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -17,7 +17,7 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict={ :param 
resource_name_mapping: Mapping from given header names to well-defined resources in simulation :param pool_type: The type of pool to be yielded :param make_drone: - :return: Yields the :py:class:`StaticPool`s found in the given iterable + :return: Yields the :py:class:`Pool`s found in the given iterable """ assert make_drone reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) diff --git a/lapis_tests/data/htcondor_pools.csv b/lapis_tests/data/htcondor_pools.csv new file mode 100644 index 0000000..82ffa21 --- /dev/null +++ b/lapis_tests/data/htcondor_pools.csv @@ -0,0 +1,5 @@ + Count TotalSlotCPUs TotalSlotDisk TotalSlotMemory + 2 2 224400.0 8000 + 2 2 223100.0 8000 + 1 8 196300.0 32200 + 1 4 29700.0 8000 \ No newline at end of file diff --git a/lapis_tests/pool_io/__init__.py b/lapis_tests/pool_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lapis_tests/pool_io/test_htcondor.py b/lapis_tests/pool_io/test_htcondor.py new file mode 100644 index 0000000..02835ee --- /dev/null +++ b/lapis_tests/pool_io/test_htcondor.py @@ -0,0 +1,24 @@ +import os + +import pytest + +from lapis.pool_io.htcondor import htcondor_pool_reader + + +def data_path(): + return os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_pools.csv") + + +class TestHtcondorPoolReader(object): + def test_init(self): + with open(data_path()) as input_file: + with pytest.raises(AssertionError): + next(htcondor_pool_reader(input_file)) + + def test_simple(self): + with open(data_path()) as input_file: + pools = 0 + for pool in htcondor_pool_reader(input_file, make_drone=lambda: None): + assert pool is not None + pools += 1 + assert pools > 0 diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py new file mode 100644 index 0000000..a985a5b --- /dev/null +++ b/lapis_tests/test_job.py @@ -0,0 +1,17 @@ +import pytest + +from lapis.job import Job + + +class TestJob(object): + def test_init(self): + with pytest.raises(AssertionError): + Job({}, {}) + assert Job({}, {"walltime": 100}) + + def test_name(self): + name = "test" + job = Job({}, {"walltime": 100}, name=name) + assert job.name == name + job = Job({}, {"walltime": 100}) + assert job.name == id(job) From d0deaf8f2fdde5e3f6161d35e9d35faf53e34cb7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 15:17:09 +0200 Subject: [PATCH 122/648] removed outdated function --- lapis/job.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index b6eaec9..4c46ef8 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -92,11 +92,6 @@ def __repr__(self): return '<%s: %s>' % (self.__class__.__name__, self._name or id(self)) -def job_property_generator(**kwargs): - while True: - yield 10, {"memory": 8, "cores": 1, "disk": 100} - - async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): job = next(job_generator) base_date = job.queue_date From 64d65920a41229314846ff43d7d62395a00b5319 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 16:57:05 +0200 Subject: [PATCH 123/648] at drone level selected resources can now be ignored for utilisation and allocation --- lapis/drone.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 560ee4d..da160df 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -11,7 +11,8 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): - def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, exclusive: bool=False): + def __init__(self, scheduler, 
pool_resources: dict, scheduling_duration: float, exclusive: bool=False, + ignore_resources: list=None): """ :param scheduler: :param pool_resources: @@ -22,6 +23,7 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, self.scheduler = scheduler self.pool_resources = pool_resources self.resources = {resource: 0 for resource in self.pool_resources} + self._valid_resource_keys = [resource for resource in self.pool_resources if resource not in ignore_resources] # shadowing requested resources to determine jobs to be killed self.used_resources = {resource: 0 for resource in self.pool_resources} self.scheduling_duration = scheduling_duration @@ -68,8 +70,8 @@ def allocation(self) -> float: def _init_allocation_and_utilisation(self): resources = [] - for resource_key, value in self.used_resources.items(): - resources.append(value / self.pool_resources[resource_key]) + for resource_key in self._valid_resource_keys: + resources.append(self.resources[resource_key] / self.pool_resources[resource_key]) self._allocation = max(resources) self._utilisation = min(resources) From b22e1bb64fbf91f4a08353d6de610df9da0be3c5 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 17:31:20 +0200 Subject: [PATCH 124/648] pool now regularises its supply with regard to current demand --- lapis/pool.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lapis/pool.py b/lapis/pool.py index e0cee8d..dce1d07 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -59,16 +59,17 @@ async def run(self): scope.do(drone.run()) self._drones.append(drone) self.put(1) - if self.level > self._demand: - for drone in self.drones: # only consider drones, that supply resources + if self.level > self._demand and self.level > 1: + empty_drone_found = False + for drone in self.drones: if drone.jobs == 0: + empty_drone_found = True break - else: - break - self.get(1) - self._drones.remove(drone) - scope.do(drone.shutdown()) - del drone + if empty_drone_found: + self.get(1) + self._drones.remove(drone) + scope.do(drone.shutdown()) + del drone await (time + 1) @property From eb10504ba7d7289b0763915dc09010e3bf3adfd2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 17:32:41 +0200 Subject: [PATCH 125/648] changed format of logging to include representation of pool and jobs --- lapis/drone.py | 6 +++++- lapis/job.py | 11 ++++++++--- lapis/utility/monitor.py | 8 ++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index da160df..e784028 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -151,7 +151,11 @@ async def start_job(self, job: Job, kill: bool=False): usage = job.used_resources.get(resource_key, None) or job.resources.get(resource_key, None) value = usage / (job.resources.get(resource_key, None) or self.pool_resources[resource_key]) if value > 1: - logging.info(str(round(time.now)), {"job_exceeds_%s" % resource_key: value}) + logging.info(str(round(time.now)), { + "job_exceeds_%s" % resource_key: { + repr(job): value + } + }) else: self.jobs -= 1 self._remove_resources(job_keys, self.resources, job.resources, job.used_resources) diff --git a/lapis/job.py b/lapis/job.py index 4c46ef8..286b5a4 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -80,12 +80,17 @@ def waiting_time(self) -> float: async def run(self): self.in_queue_until = time.now logging.info(str(round(time.now)), { - "job_queue_time": self.queue_date, - "job_waiting_time": self.waiting_time + "job_queue_time": { + repr(self): self.queue_date + 
}, "job_waiting_time": { + repr(self): self.waiting_time + } }) await (time + self.walltime or self.requested_walltime) logging.info(str(round(time.now)), { - "job_wall_time": self.walltime or self.requested_walltime + "job_wall_time": { + repr(self): self.walltime or self.requested_walltime + } }) def __repr__(self): diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 8a012f1..075081b 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -93,10 +93,10 @@ def collect_pool_statistics(simulator: Simulator) -> dict: pool_utilisation = {} pool_allocation = {} for pool in simulator.pools: - pool_demand[id(pool)] = pool.demand - pool_supply[id(pool)] = pool.supply - pool_utilisation[id(pool)] = pool.utilisation - pool_allocation[id(pool)] = pool.allocation + pool_demand[repr(pool)] = pool.demand + pool_supply[repr(pool)] = pool.supply + pool_utilisation[repr(pool)] = pool.utilisation + pool_allocation[repr(pool)] = pool.allocation return { "pool": { "demand": pool_demand, From 37b4377e37339b0d2ef2e485a9d9a641c3e246b3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 17:33:50 +0200 Subject: [PATCH 126/648] fixed mistake where wasted resources for resource type not required by job is calculated --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 59280d3..7c6303b 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -64,7 +64,7 @@ def _schedule_job(self, job) -> Drone: if resource_type not in drone.resources.keys(): cost = float("Inf") elif resource_type not in job.resources: - cost += drone.resources[resource_type] - drone.resources[resource_type] + cost += drone.pool_resources[resource_type] - drone.resources[resource_type] elif (drone.pool_resources[resource_type] - drone.resources[resource_type]) < \ job.resources[resource_type]: cost = float("Inf") From 3f3952ce5ddfa243fef5cb175b17734f9b9b8031 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 8 Apr 2019 17:39:51 +0200 Subject: [PATCH 127/648] adapted comment --- lapis/drone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index e784028..5700977 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -126,7 +126,7 @@ async def start_job(self, job: Job, kill: bool=False): except ResourcesExceeded: job_execution.cancel() try: - # TODO: should we really kill the job if it is only about resources and not used resources? 
+ # TODO: we should allow for overbooking of resources self._add_resources(job_keys, self.resources, job.resources, job.used_resources) except ResourcesExceeded: job_execution.cancel() From ee705e399daa86c84261adbcde368acd6b3f60e9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 11:59:24 +0200 Subject: [PATCH 128/648] added unit conversion to change units from kB, KiB, MiB, MB, to GB --- lapis/job_io/htcondor.py | 41 +++++++++++++++++++------------ lapis/job_io/swf.py | 51 +++++++++++++++++++++++++++------------ lapis/pool_io/htcondor.py | 11 ++++++--- 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 3a061db..7c38bce 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -6,15 +6,25 @@ def htcondor_job_reader(iterable, resource_name_mapping={ "cores": "RequestCpus", - "walltime": "RequestWalltime", - "memory": "RequestMemory", - "disk": "RequestDisk" + "walltime": "RequestWalltime", # s + "memory": "RequestMemory", # MiB + "disk": "RequestDisk" # KiB }, used_resource_name_mapping={ "queuetime": "QDate", - "walltime": "RemoteWallClockTime", + "walltime": "RemoteWallClockTime", # s "cores": "Number of Allocated Processors", - "memory": "MemoryUsage", - "disk": "DiskUsage_RAW" + "memory": "MemoryUsage", # MB + "disk": "DiskUsage_RAW" # KiB +}, unit_conversion_mapping={ + "RequestCpus": 1, + "RequestWalltime": 1, + "RequestMemory": 1.024/1024, + "RequestDisk": 1.024/1024/1024, + "queuetime": 1, + "RemoteWallClockTime": 1, + "Number of Allocated Processors": 1, + "MemoryUsage": 1/1024, + "DiskUsage_RAW": 1.024/1024/1024 }): htcondor_reader = csv.DictReader(iterable, delimiter=' ', quotechar="'") @@ -23,17 +33,18 @@ def htcondor_job_reader(iterable, resource_name_mapping={ logging.getLogger("implementation").warning("removed job from htcondor import", row) continue resources = {} - for key in resource_name_mapping: + for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(row[resource_name_mapping[key]]) + resources[key] = float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) except ValueError: pass + used_resources = {"cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"]) / + float(row[used_resource_name_mapping["walltime"]])) * unit_conversion_mapping.get( + used_resource_name_mapping[key], 1)} + for key in ["memory", "walltime", "disk"]: + original_key = used_resource_name_mapping[key] + used_resources[key] = float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) yield Job( resources=resources, - used_resources={ - "cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) / - float(row[used_resource_name_mapping["walltime"]]), - "memory": float(row[used_resource_name_mapping["memory"]]), - "walltime": float(row[used_resource_name_mapping["walltime"]]), - "disk": float(row[used_resource_name_mapping["disk"]]) - }, queue_date=float(row[used_resource_name_mapping["queuetime"]])) + used_resources=used_resources, + queue_date=float(row[used_resource_name_mapping["queuetime"]])) diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 5493e98..35b6b9a 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -1,3 +1,8 @@ +""" +Import of jobs from the parallel workload archive. +Current implementation is based on version 2.2 of the +[Standard Workload Format](http://www.cs.huji.ac.il/labs/parallel/workload/swf.html). 
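The HTCondor import above keeps the raw column value and multiplies it with a per-column factor, falling back to 1 for columns without an entry in unit_conversion_mapping. A standalone sketch of that lookup (the row values are illustrative; the factors are the ones used in the patch):

unit_conversion_mapping = {
    "RequestMemory": 1.024 / 1024,       # MiB column, scaled as in the patch above
    "RequestDisk": 1.024 / 1024 / 1024,  # KiB column, scaled as in the patch above
}
row = {"RequestCpus": "4", "RequestMemory": "2048", "RequestDisk": "1048576"}

converted = {
    column: float(value) * unit_conversion_mapping.get(column, 1)
    for column, value in row.items()
}
print(converted)  # RequestCpus stays unscaled; memory and disk are rescaled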
+""" import csv from lapis.job import Job @@ -12,18 +17,21 @@ def swf_job_reader(iterable, resource_name_mapping={ "cores": "Number of Allocated Processors", "memory": "Used Memory", "queuetime": "Submit Time" +}, unit_conversion_mapping={ + "Used Memory": 1/1024/1024, + "Requested Memory": 1/2114/1024 }): header = { "Job Number": 0, "Submit Time": 1, - "Wait Time": 2, - "Run Time": 3, + "Wait Time": 2, # s + "Run Time": 3, # s "Number of Allocated Processors": 4, - "Average CPU Time Used": 5, - "Used Memory": 6, + "Average CPU Time Used": 5, # s + "Used Memory": 6, # average kB per processor "Requested Number of Processors": 7, "Requested Time": 8, - "Requested Memory": 9, + "Requested Memory": 9, # kB per processor "Status": 10, "User ID": 11, "Group ID": 12, @@ -31,18 +39,29 @@ def swf_job_reader(iterable, resource_name_mapping={ "Queue Number": 14, "Partition Number": 15, "Preceding Job Number": 16, - "Think Time from Preceding Job": 17 + "Think Time from Preceding Job": 17 # s } reader = csv.reader((line for line in iterable if line[0] != ';'), delimiter=' ', skipinitialspace=True) for row in reader: + resources = {} + used_resources = {} + for key in ["cores", "walltime"]: + value = float(row[header[resource_name_mapping[key]]]) + used_value = float(row[header[used_resource_name_mapping[key]]]) + if value >= 0: + resources[key] = value * used_resource_name_mapping.get(resource_name_mapping[key], 1) + if used_value >= 0: + used_resources[key] = used_value * used_resource_name_mapping.get(used_resource_name_mapping[key], 1) + resources[key] = float() + # handle memory + key = "memory" + resources[key] = (float(row[header[resource_name_mapping[key]]]) * float( + row[header[resource_name_mapping["cores"]]])) * unit_conversion_mapping.get(resource_name_mapping[key], 1) + used_resources[key] = (float(row[header[used_resource_name_mapping[key]]]) * float( + row[header[used_resource_name_mapping["cores"]]])) * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + yield Job( - resources={ - key: float(row[header[resource_name_mapping[key]]]) - for key in ("cores", "memory", "walltime") - if float(row[header[resource_name_mapping[key]]]) >= 0 - }, - used_resources={ - key: float(row[header[used_resource_name_mapping[key]]]) - for key in ("cores", "memory", "walltime") - if float(row[header[used_resource_name_mapping[key]]]) >= 0 - }, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), name=row[header["Job Number"]]) + resources=resources, + used_resources=used_resources, + queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), + name=row[header["Job Number"]]) diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 4b298d9..2c6afbf 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -7,8 +7,12 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict={ "cores": "TotalSlotCPUs", - "disk": "TotalSlotDisk", - "memory": "TotalSlotMemory" + "disk": "TotalSlotDisk", # KiB + "memory": "TotalSlotMemory" # MiB +}, unit_conversion_mapping={ + "TotalSlotCPUs": 1, + "TotalSlotDisk": 1.024/1024/1024, + "TotalSlotMemory": 1.024/1024 }, pool_type: Callable=Pool, make_drone: Callable=None): """ Load a pool configuration that was exported via htcondor from files or iterables @@ -24,4 +28,5 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict={ for row_idx, row in enumerate(reader): yield pool_type( capacity=int(row["Count"]), - make_drone=partial(make_drone, {key: float(row[value]) for key, value in 
resource_name_mapping.items()})) + make_drone=partial(make_drone, {key: float(row[value]) * unit_conversion_mapping.get(value, 1) + for key, value in resource_name_mapping.items()})) From cc445e02feb5d92b67d1e68edfd00e81f5315c41 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 20:41:45 +0200 Subject: [PATCH 129/648] added tests for jobs --- lapis_tests/test_job.py | 103 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 4 deletions(-) diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index a985a5b..eefa9f3 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -1,17 +1,112 @@ import pytest +from usim import Scope, time +from lapis.drone import Drone from lapis.job import Job +from lapis_tests.utility import via_usim, DummyScheduler class TestJob(object): def test_init(self): with pytest.raises(AssertionError): - Job({}, {}) - assert Job({}, {"walltime": 100}) + Job(resources={}, used_resources={}) + with pytest.raises(AssertionError): + Job(resources={"walltime": 100}, used_resources={}) + assert Job(resources={}, used_resources={"walltime": 100}) + with pytest.raises(AssertionError): + Job(resources={}, used_resources={"walltime": 100}, in_queue_since=-5) def test_name(self): name = "test" - job = Job({}, {"walltime": 100}, name=name) + job = Job(resources={}, used_resources={"walltime": 100}, name=name) assert job.name == name - job = Job({}, {"walltime": 100}) + assert repr(job) == "" % name + job = Job(resources={}, used_resources={"walltime": 100}) assert job.name == id(job) + assert repr(job) == "" % id(job) + + @via_usim + async def test_run_job(self): + job = Job(resources={"walltime": 50}, used_resources={"walltime": 10}) + async with Scope() as scope: + scope.do(job.run()) + assert 10 == time + assert 0 == job.waiting_time + assert job.successful + + @via_usim + async def test_job_in_drone(self): + scheduler = DummyScheduler() + job = Job( + resources={"walltime": 50, "cores": 1, "memory": 1}, + used_resources={"walltime": 10, "cores": 1, "memory": 1}) + drone = Drone( + scheduler=scheduler, + pool_resources={"cores": 1, "memory": 1}, + scheduling_duration=0) + async with Scope() as scope: + scope.do(drone.start_job(job=job)) + assert 10 == time + assert 0 == job.waiting_time + assert job.successful + + @via_usim + async def test_nonmatching_job_in_drone(self): + scheduler = DummyScheduler() + job = Job( + resources={"walltime": 50, "cores": 2, "memory": 1}, + used_resources={"walltime": 10, "cores": 1, "memory": 1}) + drone = Drone( + scheduler=scheduler, + pool_resources={"cores": 1, "memory": 1}, + scheduling_duration=0) + async with Scope() as scope: + scope.do(drone.start_job(job=job)) + assert 0 == time + assert not job.successful + assert 0 == job.waiting_time + + @via_usim + async def test_two_nonmatching_jobs(self): + scheduler = DummyScheduler() + job_one = Job( + resources={"walltime": 50, "cores": 1, "memory": 1}, + used_resources={"walltime": 10, "cores": 1, "memory": 1}) + job_two = Job( + resources={"walltime": 50, "cores": 1, "memory": 1}, + used_resources={"walltime": 10, "cores": 1, "memory": 1}) + drone = Drone( + scheduler=scheduler, + pool_resources={"cores": 1, "memory": 1}, + scheduling_duration=0) + async with Scope() as scope: + scope.do(drone.start_job(job=job_one)) + scope.do(drone.start_job(job=job_two)) + assert 10 == time + assert job_one.successful + assert not job_two.successful + assert 0 == job_one.waiting_time + assert 0 == job_two.waiting_time + + @via_usim + async def 
test_two_matching_jobs(self): + scheduler = DummyScheduler() + job_one = Job( + resources={"walltime": 50, "cores": 1, "memory": 1}, + used_resources={"walltime": 10, "cores": 1, "memory": 1}) + job_two = Job( + resources={"walltime": 50, "cores": 1, "memory": 1}, + used_resources={"walltime": 10, "cores": 1, "memory": 1}) + drone = Drone( + scheduler=scheduler, + pool_resources={"cores": 2, "memory": 2}, + scheduling_duration=0) + async with Scope() as scope: + scope.do(drone.start_job(job=job_one)) + scope.do(drone.start_job(job=job_two)) + assert 10 == time + assert job_one.successful + assert job_two.successful + assert 0 == job_one.waiting_time + assert 0 == job_two.waiting_time + From 9b50d56a1a0d3995ccd6a34961019e230135e5e1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 20:42:30 +0200 Subject: [PATCH 130/648] ignore resources now also works for empty lists --- lapis/drone.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index 5700977..23b8670 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -23,7 +23,10 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, self.scheduler = scheduler self.pool_resources = pool_resources self.resources = {resource: 0 for resource in self.pool_resources} - self._valid_resource_keys = [resource for resource in self.pool_resources if resource not in ignore_resources] + if ignore_resources: + self._valid_resource_keys = [resource for resource in self.pool_resources if resource not in ignore_resources] + else: + self._valid_resource_keys = self.pool_resources.keys() # shadowing requested resources to determine jobs to be killed self.used_resources = {resource: 0 for resource in self.pool_resources} self.scheduling_duration = scheduling_duration From 4e69b5d057f00f5956ff6ea02db3aa73364c6757 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 20:43:53 +0200 Subject: [PATCH 131/648] jobs now react on cancellation --- lapis/drone.py | 2 +- lapis/job.py | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 23b8670..6486370 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -122,6 +122,7 @@ async def start_job(self, job: Job, kill: bool=False): self._utilisation = self._allocation = None job_execution = scope.do(job.run()) + await instant # waiting just a moment to enable job to set parameters job_keys = {*job.resources, *job.used_resources} try: @@ -144,7 +145,6 @@ async def start_job(self, job: Job, kill: bool=False): except KeyError: # check is not relevant if the data is not stored pass - await instant # waiting just a moment to enable job to set parameters if job_execution.status != ActivityState.CANCELLED: self.jobs += 1 await sampling_required.set(True) diff --git a/lapis/job.py b/lapis/job.py index 286b5a4..d22fd33 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -6,6 +6,9 @@ # TODO: needs refactoring +from usim._primitives.activity import CancelActivity + + def job_demand(simulator): """ function randomly sets global user demand by using different strategies @@ -37,7 +40,7 @@ def job_demand(simulator): class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", - "in_queue_until", "_name") + "in_queue_until", "_name", "_success") def __init__(self, resources: dict, used_resources: dict, in_queue_since: float=0, queue_date: float=0, name: str=None): @@ -56,16 +59,22 @@ def 
__init__(self, resources: dict, used_resources: dict, in_queue_since: float= self.used_resources = used_resources self.walltime = used_resources.pop("walltime", None) self.requested_walltime = resources.pop("walltime", None) - assert self.walltime or self.requested_walltime, "Job does not provide any walltime" + assert self.walltime, "Job does not provide any walltime" self.queue_date = queue_date + assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since self.in_queue_until = None self._name = name + self._success = False @property def name(self) -> str: return self._name or id(self) + @property + def successful(self) -> bool: + return self._success + @property def waiting_time(self) -> float: """ @@ -86,12 +95,23 @@ async def run(self): repr(self): self.waiting_time } }) - await (time + self.walltime or self.requested_walltime) - logging.info(str(round(time.now)), { - "job_wall_time": { - repr(self): self.walltime or self.requested_walltime - } - }) + try: + await (time + self.walltime) + except CancelActivity: + self._success = False + except BaseException as err: + self._success = False + raise + else: + logging.info(str(round(time.now)), { + "job_wall_time": { + repr(self): self.walltime + } + }) + self._success = True + finally: + # release acquired resources + pass def __repr__(self): return '<%s: %s>' % (self.__class__.__name__, self._name or id(self)) From 0f06be4def1cba13e5c01c44e8e0cfd432909caa Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 20:44:17 +0200 Subject: [PATCH 132/648] utilities for tests --- lapis_tests/utility.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 lapis_tests/utility.py diff --git a/lapis_tests/utility.py b/lapis_tests/utility.py new file mode 100644 index 0000000..161f45a --- /dev/null +++ b/lapis_tests/utility.py @@ -0,0 +1,40 @@ +from typing import Callable, Coroutine +from functools import wraps + +from usim import run +from usim._core.loop import ActivityError + +from lapis.drone import Drone + + +def via_usim(test_case: Callable[..., Coroutine]): + """ + Mark an ``async def`` test case to be run via ``usim.run`` + + .. code:: python3 + + @via_usim + async def test_sleep(): + before = time.now + await (time + 20) + after = time.now + assert after - before == 20 + """ + @wraps(test_case) + def run_test(*args, **kwargs): + # pytest currently ignores __tracebackhide__ if we re-raise + # https://github.com/pytest-dev/pytest/issues/1904 + __tracebackhide__ = True + # >>> This is not the frame you are looking for. Do read on. 
<<< + try: + return run(test_case(*args, **kwargs)) + except ActivityError as err: + # unwrap any exceptions + raise err.__cause__ + return run_test + + +class DummyScheduler(): + @staticmethod + def register_drone(drone: Drone): + pass From bd0db063171f2df097a68eb45748197ed1e13bf7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 20:54:44 +0200 Subject: [PATCH 133/648] omitting __about__.py from coverage --- .coveragerc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.coveragerc b/.coveragerc index 6f17f19..16c8bfe 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,6 +3,8 @@ source = lapis branch = TRUE cover_pylib = FALSE parallel = False +omit = + __about__.py [report] exclude_lines = From aed2d718bfef02fb5059bbdddad58fc93be6419c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 21:04:01 +0200 Subject: [PATCH 134/648] fixed path of about file --- .coveragerc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index 16c8bfe..217073f 100644 --- a/.coveragerc +++ b/.coveragerc @@ -4,7 +4,7 @@ branch = TRUE cover_pylib = FALSE parallel = False omit = - __about__.py + lapis/__about__.py [report] exclude_lines = From 477471e68acaf328dee97c834cdcd6eb826e79c4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 9 Apr 2019 21:09:29 +0200 Subject: [PATCH 135/648] added test for infitinity waiting time --- lapis_tests/test_job.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index eefa9f3..11818b7 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -28,6 +28,7 @@ def test_name(self): @via_usim async def test_run_job(self): job = Job(resources={"walltime": 50}, used_resources={"walltime": 10}) + assert float("inf") == job.waiting_time async with Scope() as scope: scope.do(job.run()) assert 10 == time From 58b25d353b36e6901c710edde73cdfd550e38b21 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 15:43:45 +0200 Subject: [PATCH 136/648] starting simulations from cli now with linear controller --- lapis/cli/simulate.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 001ca87..4005f6b 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -3,7 +3,7 @@ from cobald.monitor.format_json import JsonFormatter -from lapis.controller import SimulatedCostController +from lapis.controller import SimulatedLinearController from lapis.job_io.htcondor import htcondor_job_reader from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader @@ -35,7 +35,7 @@ def makePickle(self, record): @click.group() @click.option("--seed", type=int, default=1234) -@click.option("--until", default=2000) +@click.option("--until", type=float) @click.option("--log-tcp", "log_tcp", is_flag=True) @click.option("--log-file", "log_file", type=click.File("w")) @click.pass_context @@ -88,7 +88,7 @@ def dynamic(ctx, job_file, pool_file): pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=Pool, - controller=SimulatedCostController) + controller=SimulatedLinearController) simulator.run(until=ctx.obj["until"]) @@ -108,7 +108,11 @@ def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): simulator.create_pools(pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=StaticPool) for current_pool in dynamic_pool_file: file, file_type = current_pool - simulator.create_pools(pool_input=file, 
pool_reader=pool_import_mapper[file_type], pool_type=Pool, controller=SimulatedCostController) + simulator.create_pools( + pool_input=file, + pool_reader=pool_import_mapper[file_type], + pool_type=Pool, + controller=SimulatedLinearController) simulator.run(until=ctx.obj["until"]) From 5045fada1273a1939455fafe36227073b5331702 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 15:45:02 +0200 Subject: [PATCH 137/648] added property for theoretically available drone resources --- lapis/drone.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index 6486370..8f625d1 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -40,6 +40,10 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, self._allocation = None self._utilisation = None + @property + def theoretical_available_resources(self): + return {key: self.pool_resources[key] - self.resources[key] for key in self.pool_resources} + async def run(self): from lapis.utility.monitor import sampling_required await (time + self.scheduling_duration) From d6efbc72762aa4012ac1fc5695c19f43fd454d24 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 15:45:47 +0200 Subject: [PATCH 138/648] utilisation is now 0 when there is no allocation --- lapis/pool.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapis/pool.py b/lapis/pool.py index dce1d07..7af72c5 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -95,8 +95,7 @@ def allocation(self) -> float: def utilisation(self) -> float: utilisations = [] for drone in self._drones: - if drone.allocation > 0: - utilisations.append(drone.utilisation) + utilisations.append(drone.utilisation) try: return sum(utilisations) / len(utilisations) except ZeroDivisionError: From 08293fc75b9bc9fd634ee2a35c7988f3515b488f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 16:00:58 +0200 Subject: [PATCH 139/648] pool import of total slot disk now calculated in MiB --- lapis/pool_io/htcondor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 2c6afbf..f2ea5eb 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -7,11 +7,11 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict={ "cores": "TotalSlotCPUs", - "disk": "TotalSlotDisk", # KiB + "disk": "TotalSlotDisk", # MiB "memory": "TotalSlotMemory" # MiB }, unit_conversion_mapping={ "TotalSlotCPUs": 1, - "TotalSlotDisk": 1.024/1024/1024, + "TotalSlotDisk": 1.024/1024, "TotalSlotMemory": 1.024/1024 }, pool_type: Callable=Pool, make_drone: Callable=None): """ From 0b834ff5753a53642ddf36b536cf8c3319af31b8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 16:01:36 +0200 Subject: [PATCH 140/648] capacity from pool_io can now also be infinite when defining None in input file --- lapis/pool_io/htcondor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index f2ea5eb..d6bcaf0 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -26,7 +26,13 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict={ assert make_drone reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) for row_idx, row in enumerate(reader): + try: + capacity = int(row["Count"]) + except ValueError: + if row["Count"] == "None": + capacity = float("Inf") yield pool_type( - capacity=int(row["Count"]), + capacity=capacity, 
make_drone=partial(make_drone, {key: float(row[value]) * unit_conversion_mapping.get(value, 1) - for key, value in resource_name_mapping.items()})) + for key, value in resource_name_mapping.items()}, + ignore_resources=["disk"])) From bbfaf53b08a4b6e12a71231860605dcd68144562 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 16:04:52 +0200 Subject: [PATCH 141/648] introduced clustering for handling of drones to determine best matches --- lapis/cost.py | 2 +- lapis/drone.py | 2 ++ lapis/scheduler.py | 51 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/lapis/cost.py b/lapis/cost.py index 4518910..f391ff7 100644 --- a/lapis/cost.py +++ b/lapis/cost.py @@ -1,5 +1,5 @@ def cobald_cost(simulator): - result = len(simulator.job_scheduler.drone_list) + result = len(list(simulator.job_scheduler.drone_list)) for drone in simulator.job_scheduler.drone_list: result += 1 tmp = 0 diff --git a/lapis/drone.py b/lapis/drone.py index 8f625d1..5d7a22d 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -149,6 +149,7 @@ async def start_job(self, job: Job, kill: bool=False): except KeyError: # check is not relevant if the data is not stored pass + self.scheduler.update_drone(self) if job_execution.status != ActivityState.CANCELLED: self.jobs += 1 await sampling_required.set(True) @@ -168,6 +169,7 @@ async def start_job(self, job: Job, kill: bool=False): self._remove_resources(job_keys, self.resources, job.resources, job.used_resources) self._remove_resources(job_keys, self.used_resources, job.used_resources, job.resources) self._utilisation = self._allocation = None + self.scheduler.update_drone(self) await sampling_required.set(True) def __repr__(self): diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 7c6303b..bb43e58 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,6 +1,4 @@ -from collections import deque - -from usim import time, Scope, each +from usim import Scope, each, instant # TODO: does not work anymore as there is no method get_drone at pool from lapis.drone import Drone @@ -31,15 +29,51 @@ class CondorJobScheduler(object): """ def __init__(self, job_queue): self._stream_queue = job_queue - self.drone_list = [] + self.drone_cluster = [] self.interval = 60 self.job_queue = [] + @property + def drone_list(self): + for cluster in self.drone_cluster: + for drone in cluster: + yield drone + def register_drone(self, drone: Drone): - self.drone_list.append(drone) + self._add_drone(drone) def unregister_drone(self, drone: Drone): - self.drone_list.remove(drone) + for cluster in self.drone_cluster: + try: + cluster.remove(drone) + except ValueError: + pass + else: + if len(cluster) == 0: + self.drone_cluster.remove(cluster) + + def _add_drone(self, drone: Drone): + minimum_distance_cluster = None + distance = float("Inf") + if len(self.drone_cluster) > 0: + for cluster in self.drone_cluster: + current_distance = 0 + for key in {*cluster[0].theoretical_available_resources, *drone.theoretical_available_resources}: + current_distance += abs(cluster[0].theoretical_available_resources.get(key, 0) - + drone.theoretical_available_resources.get(key, 0)) + if current_distance < distance: + minimum_distance_cluster = cluster + distance = current_distance + if distance < 1: + minimum_distance_cluster.append(drone) + else: + self.drone_cluster.append([drone]) + else: + self.drone_cluster.append([drone]) + + def update_drone(self, drone: Drone): + self.unregister_drone(drone) + self._add_drone(drone) async def run(self): 
async with Scope() as scope: @@ -49,6 +83,7 @@ async def run(self): best_match = self._schedule_job(job) if best_match: scope.do(best_match.start_job(job)) + await instant self.job_queue.remove(job) async def _collect_jobs(self): @@ -57,12 +92,14 @@ async def _collect_jobs(self): def _schedule_job(self, job) -> Drone: priorities = {} - for drone in self.drone_list: + for cluster in self.drone_cluster: + drone = cluster[0] cost = 0 resource_types = {*drone.resources.keys(), *job.resources.keys()} for resource_type in resource_types: if resource_type not in drone.resources.keys(): cost = float("Inf") + break elif resource_type not in job.resources: cost += drone.pool_resources[resource_type] - drone.resources[resource_type] elif (drone.pool_resources[resource_type] - drone.resources[resource_type]) < \ From c41d84d0d86175c37ea4da521def3cf08246f152 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 16:05:41 +0200 Subject: [PATCH 142/648] enabled logging from simulator --- lapis/simulator.py | 15 ++++++++++++++- lapis/utility/monitor.py | 18 ++++++++++-------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 4057817..bfda294 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -1,11 +1,13 @@ import random from functools import partial -from usim import run, time, until +from usim import run, time, until, Scope from usim.basics import Queue from lapis.drone import Drone from lapis.job import job_to_queue_scheduler +from lapis.utility.monitor import Monitoring, collect_pool_statistics, collect_user_demand, collect_job_statistics, \ + collect_resource_statistics, collect_cobald_cost class Simulator(object): @@ -19,6 +21,16 @@ def __init__(self, seed=1234): self.job_generator = None self.cost = 0 self._job_generators = [] + self.monitoring = None + self.enable_monitoring() + + def enable_monitoring(self): + self.monitoring = Monitoring(self) + self.monitoring.register_statistic(collect_pool_statistics) + self.monitoring.register_statistic(collect_user_demand) + self.monitoring.register_statistic(collect_job_statistics) + self.monitoring.register_statistic(collect_resource_statistics) + self.monitoring.register_statistic(collect_cobald_cost) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) @@ -48,6 +60,7 @@ async def _simulate(self, end): while_running.do(self.job_scheduler.run()) for controller in self.controllers: while_running.do(controller.run()) + while_running.do(self.monitoring.run()) print("Finished simulation at %s" % time.now) async def _queue_jobs(self, job_input, job_reader): diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 075081b..f11ddd7 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -1,19 +1,21 @@ from functools import wraps -from typing import Callable +from typing import Callable, TYPE_CHECKING import logging from usim import each, Flag, time from lapis.cost import cobald_cost -from lapis.simulator import Simulator + +if TYPE_CHECKING: + from lapis.simulator import Simulator sampling_required = Flag() class Monitoring(object): # TODO: we need to check how to integrate the normalization factor - def __init__(self, simulator: Simulator): + def __init__(self, simulator: "Simulator"): self.simulator = simulator self._statistics = [] @@ -31,7 +33,7 @@ def register_statistic(self, statistic: Callable): self._statistics.append(statistic) -def collect_resource_statistics(simulator: Simulator) -> dict: 
+def collect_resource_statistics(simulator: "Simulator") -> dict: empty_drones = 0 drone_resources = {} for drone in simulator.job_scheduler.drone_list: @@ -61,7 +63,7 @@ def collect_resource_statistics(simulator: Simulator) -> dict: } -def collect_cobald_cost(simulator: Simulator) -> dict: +def collect_cobald_cost(simulator: "Simulator") -> dict: current_cost = cobald_cost(simulator) simulator.cost += current_cost return { @@ -72,13 +74,13 @@ def collect_cobald_cost(simulator: Simulator) -> dict: } -def collect_user_demand(simulator: Simulator) -> dict: +def collect_user_demand(simulator: "Simulator") -> dict: return { "user_demand": len(simulator.job_scheduler.job_queue) } -def collect_job_statistics(simulator: Simulator) -> dict: +def collect_job_statistics(simulator: "Simulator") -> dict: result = 0 for drone in simulator.job_scheduler.drone_list: result += drone.jobs @@ -87,7 +89,7 @@ def collect_job_statistics(simulator: Simulator) -> dict: } -def collect_pool_statistics(simulator: Simulator) -> dict: +def collect_pool_statistics(simulator: "Simulator") -> dict: pool_demand = {} pool_supply = {} pool_utilisation = {} From 80b33cff9f3e6ff94d2fa11d9c6629a8b87bcd1e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 16:06:14 +0200 Subject: [PATCH 143/648] fixes to simulator to also support simulation until end --- lapis/simulator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index bfda294..4101012 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -40,19 +40,18 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): for pool in pool_reader(iterable=pool_input, pool_type=pool_type, make_drone=partial(Drone, self.job_scheduler)): self.pools.append(pool) if controller: - controller(target=pool, rate=1) - self.controllers.append(controller) + self.controllers.append(controller(target=pool, rate=1)) def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) - def run(self, until=2000): + def run(self, until=None): print("running until", until) run(self._simulate(until)) async def _simulate(self, end): print("Starting simulation at %s" % time.now) - async with until(time == end) as while_running: + async with until(time == end) if end else Scope() as while_running: for pool in self.pools: while_running.do(pool.run()) for job_input, job_reader in self._job_generators: From ac70f6bd00554b786a251efc4cd7d3f77adff947 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 11 Apr 2019 18:10:03 +0200 Subject: [PATCH 144/648] added required methods for DummyScheduler for tests --- lapis_tests/utility.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapis_tests/utility.py b/lapis_tests/utility.py index 161f45a..630289d 100644 --- a/lapis_tests/utility.py +++ b/lapis_tests/utility.py @@ -38,3 +38,11 @@ class DummyScheduler(): @staticmethod def register_drone(drone: Drone): pass + + @staticmethod + def unregister_drone(drone: Drone): + pass + + @staticmethod + def update_drone(drone: Drone): + pass From a8749e63cbe2fafb3a1f045f03ec893d7193df62 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:17:31 +0200 Subject: [PATCH 145/648] added config for pep8 github integration --- .pep8speaks.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .pep8speaks.yml diff --git a/.pep8speaks.yml b/.pep8speaks.yml new file mode 100644 index 0000000..65c419a --- /dev/null +++ b/.pep8speaks.yml @@ -0,0 +1,6 @@ 
+scanner: + diff_only: False # If False, the entire file touched by the Pull Request is scanned for errors. If True, only the diff is scanned. + linter: flake8 + +flake8: + max-line-length: 88 # flake8-bugbear uses 80*1.1 \ No newline at end of file From 2796016f2031b61e781b5db99ab4d7a08b057148 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:22:10 +0200 Subject: [PATCH 146/648] added flake8 configuration --- setup.cfg | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.cfg b/setup.cfg index b7e4789..2fd4b35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,9 @@ [aliases] test=pytest + +[flake8] +statistics = True +max-line-length = 80 +ignore = E501, B008, B011 +select = C,E,F,W,B,B9 +exclude = docs,.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg \ No newline at end of file From 4fae9b35ca7bbfc6f132122edbf7cef749b99492 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:55:41 +0200 Subject: [PATCH 147/648] removed unused imports --- lapis/drone.py | 2 +- lapis/utility/monitor.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 5d7a22d..0337fa6 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,7 +1,7 @@ import logging from cobald import interfaces -from usim import time, Scope, ActivityCancelled, instant, ActivityState +from usim import time, Scope, instant, ActivityState from lapis.job import Job diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index f11ddd7..5ad6a6c 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -1,4 +1,3 @@ -from functools import wraps from typing import Callable, TYPE_CHECKING import logging From 617ac6e9f5b9fe67ebd208a33067c84338bb763e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:56:06 +0200 Subject: [PATCH 148/648] added W503 to ignored error for flake8 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2fd4b35..09dbcbb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,6 +4,6 @@ test=pytest [flake8] statistics = True max-line-length = 80 -ignore = E501, B008, B011 +ignore = E501, B008, B011, W503 select = C,E,F,W,B,B9 exclude = docs,.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg \ No newline at end of file From a86dcd125d673810689c5aa9aa9245fa080ad8cb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:57:12 +0200 Subject: [PATCH 149/648] fixed whitespace errors around operators and equals --- lapis/drone.py | 6 +++--- lapis/job.py | 14 +++++++------- lapis/job_io/htcondor.py | 8 ++++---- lapis/job_io/swf.py | 4 ++-- lapis/pool.py | 6 +++--- lapis/pool_io/htcondor.py | 10 +++++----- lapis/pool_io/machines.py | 4 ++-- setup.py | 2 +- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 0337fa6..08a3b8d 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -11,8 +11,8 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): - def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, exclusive: bool=False, - ignore_resources: list=None): + def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, exclusive: bool = False, + ignore_resources: list = None): """ :param scheduler: :param pool_resources: @@ -111,7 +111,7 @@ def _remove_resources(keys: list, target: dict, source: dict, alternative_source except KeyError: target[resource_key] -= alternative_source[resource_key] - async def start_job(self, job: Job, kill: 
bool=False): + async def start_job(self, job: Job, kill: bool = False): """ Method manages to start a job in the context of the given drone. The job is started independent of available resources. If resources of drone are exceeded, the job is killed. diff --git a/lapis/job.py b/lapis/job.py index d22fd33..c4bcfdc 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -18,18 +18,18 @@ def job_demand(simulator): while True: delay = random.randint(0, 100) strategy = random.random() - if strategy < 1/3: + if strategy < 1 / 3: # linear amount # print("strategy: linear amount") - amount = random.randint(0, int(random.random()*100)) - elif strategy < 2/3: + amount = random.randint(0, int(random.random() * 100)) + elif strategy < 2 / 3: # exponential amount # print("strategy: exponential amount") - amount = (math.e**(random.random())-1)*random.random()*1000 + amount = (math.e ** (random.random()) - 1) * random.random() * 1000 else: # sqrt # print("strategy: sqrt amount") - amount = math.sqrt(random.random()*random.random()*100) + amount = math.sqrt(random.random() * random.random() * 100) value = yield simulator.env.timeout(delay=delay, value=amount) value = round(value) if value > 0: @@ -42,8 +42,8 @@ class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", "in_queue_until", "_name", "_success") - def __init__(self, resources: dict, used_resources: dict, in_queue_since: float=0, queue_date: float=0, - name: str=None): + def __init__(self, resources: dict, used_resources: dict, in_queue_since: float = 0, queue_date: float = 0, + name: str = None): """ Definition of a job that uses a specified amount of resources `used_resources` over a given amount of time, `walltime`. A job is described by its user via the parameter `resources`. 
This is a user prediction and is diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 7c38bce..761df9c 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -18,13 +18,13 @@ def htcondor_job_reader(iterable, resource_name_mapping={ }, unit_conversion_mapping={ "RequestCpus": 1, "RequestWalltime": 1, - "RequestMemory": 1.024/1024, - "RequestDisk": 1.024/1024/1024, + "RequestMemory": 1.024 / 1024, + "RequestDisk": 1.024 / 1024 / 1024, "queuetime": 1, "RemoteWallClockTime": 1, "Number of Allocated Processors": 1, - "MemoryUsage": 1/1024, - "DiskUsage_RAW": 1.024/1024/1024 + "MemoryUsage": 1 / 1024, + "DiskUsage_RAW": 1.024 / 1024 / 1024 }): htcondor_reader = csv.DictReader(iterable, delimiter=' ', quotechar="'") diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 35b6b9a..bbb849f 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -18,8 +18,8 @@ def swf_job_reader(iterable, resource_name_mapping={ "memory": "Used Memory", "queuetime": "Submit Time" }, unit_conversion_mapping={ - "Used Memory": 1/1024/1024, - "Requested Memory": 1/2114/1024 + "Used Memory": 1 / 1024 / 1024, + "Requested Memory": 1 / 2114 / 1024 }): header = { "Job Number": 0, diff --git a/lapis/pool.py b/lapis/pool.py index 7af72c5..6631a52 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -13,7 +13,7 @@ class Pool(interfaces.Pool): :param name: Name of the pool :param make_drone: Callable to create a drone with specific properties for this pool """ - def __init__(self, capacity: float=float('inf'), init: float=0, name: str=None, make_drone: Callable=None): + def __init__(self, capacity: float = float('inf'), init: float = 0, name: str = None, make_drone: Callable = None): super(Pool, self).__init__() assert make_drone self.make_drone = make_drone @@ -34,7 +34,7 @@ def get(self, amount: float): raise ValueError self.level -= amount - def init_pool(self, init: float=0): + def init_pool(self, init: float = 0): """ Initialisation of existing drones at creation time of pool. @@ -132,7 +132,7 @@ class StaticPool(Pool): :param capacity: Maximum number of pools that can be instantiated within the pool :param resources: Dictionary of resources available for each pool instantiated within the pool """ - def __init__(self, capacity: float=0, make_drone: Callable=None): + def __init__(self, capacity: float = 0, make_drone: Callable = None): assert capacity > 0, "Static pool was initialised without any resources..." 
super(StaticPool, self).__init__(capacity=capacity, init=capacity, make_drone=make_drone) self._demand = capacity diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index d6bcaf0..05a3cd7 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -5,15 +5,15 @@ from ..pool import Pool -def htcondor_pool_reader(iterable, resource_name_mapping: dict={ +def htcondor_pool_reader(iterable, resource_name_mapping: dict = { "cores": "TotalSlotCPUs", "disk": "TotalSlotDisk", # MiB "memory": "TotalSlotMemory" # MiB -}, unit_conversion_mapping={ +}, unit_conversion_mapping: dict = { "TotalSlotCPUs": 1, - "TotalSlotDisk": 1.024/1024, - "TotalSlotMemory": 1.024/1024 -}, pool_type: Callable=Pool, make_drone: Callable=None): + "TotalSlotDisk": 1.024 / 1024, + "TotalSlotMemory": 1.024 / 1024 +}, pool_type: Callable = Pool, make_drone: Callable = None): """ Load a pool configuration that was exported via htcondor from files or iterables diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index 91a397e..b5f07df 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -5,10 +5,10 @@ from ..pool import Pool -def machines_pool_reader(iterable, resource_name_mapping={ +def machines_pool_reader(iterable, resource_name_mapping: dict = { "cores": "CPUs_per_node", "memory": "RAM_per_node_in_KB" -}, pool_type=Pool, make_drone: Callable=None): +}, pool_type: Callable = Pool, make_drone: Callable = None): """ Load a pool configuration that was exported via htcondor from files or iterables diff --git a/setup.py b/setup.py index c7758b0..34aade3 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ 'click' ], extras_require={ - 'docs': ["sphinx", "sphinxcontrib-tikz"], + 'docs': ["sphinx", "sphinxcontrib-tikz"], }, # metadata for package search license='MIT', From e8c234602194bba9fde4693bd6206c62d4c7ee57 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:57:42 +0200 Subject: [PATCH 150/648] removed redundant new lines --- lapis/drone.py | 2 -- lapis_tests/test_job.py | 1 - 2 files changed, 3 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 08a3b8d..189ae14 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -174,5 +174,3 @@ async def start_job(self, job: Job, kill: bool = False): def __repr__(self): return '<%s: %s>' % (self.__class__.__name__, id(self)) - - diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 11818b7..07cb554 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -110,4 +110,3 @@ async def test_two_matching_jobs(self): assert job_two.successful assert 0 == job_one.waiting_time assert 0 == job_two.waiting_time - From 0dfdfe61ff03a7513c858fa3aaab85dcda88944f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:58:21 +0200 Subject: [PATCH 151/648] removed unused variables and old docstring parameters --- lapis/job.py | 2 +- lapis/scheduler.py | 2 +- lapis/simulator.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index c4bcfdc..ed41c80 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -99,7 +99,7 @@ async def run(self): await (time + self.walltime) except CancelActivity: self._success = False - except BaseException as err: + except BaseException: self._success = False raise else: diff --git a/lapis/scheduler.py b/lapis/scheduler.py index bb43e58..431fd58 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -24,9 +24,9 @@ class CondorJobScheduler(object): is calculated based on the current strategy that is used 
at GridKa. The scheduler checks if a job either exactly fits a slot or if it does fit into it several times. The cost for putting a job at a given slot is given by the amount of resources that might remain unallocated. - :param env: :return: """ + def __init__(self, job_queue): self._stream_queue = job_queue self.drone_cluster = [] diff --git a/lapis/simulator.py b/lapis/simulator.py index 4101012..d76486e 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -13,7 +13,6 @@ class Simulator(object): def __init__(self, seed=1234): random.seed(seed) - resource_normalisation = {"memory": 2000} self.job_queue = Queue() self.pools = [] self.controllers = [] From 314609209693fc7023356960d7e39473cb89872a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:59:13 +0200 Subject: [PATCH 152/648] fixed line breaks after binary operator --- lapis/job_io/htcondor.py | 4 ++-- lapis/scheduler.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 761df9c..0b9078e 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -38,8 +38,8 @@ def htcondor_job_reader(iterable, resource_name_mapping={ resources[key] = float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) except ValueError: pass - used_resources = {"cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"]) / - float(row[used_resource_name_mapping["walltime"]])) * unit_conversion_mapping.get( + used_resources = {"cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"]) + / float(row[used_resource_name_mapping["walltime"]])) * unit_conversion_mapping.get( used_resource_name_mapping[key], 1)} for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 431fd58..1b0ef45 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -59,8 +59,8 @@ def _add_drone(self, drone: Drone): for cluster in self.drone_cluster: current_distance = 0 for key in {*cluster[0].theoretical_available_resources, *drone.theoretical_available_resources}: - current_distance += abs(cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0)) + current_distance += abs(cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0)) if current_distance < distance: minimum_distance_cluster = cluster distance = current_distance From 38d62a239ac947a1924baee722fb4e3dba2423a7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 10:59:31 +0200 Subject: [PATCH 153/648] fixed hanging indent and missing docstring parameter --- lapis/pool_io/machines.py | 1 + lapis/scheduler.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index b5f07df..7b20695 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -12,6 +12,7 @@ def machines_pool_reader(iterable, resource_name_mapping: dict = { """ Load a pool configuration that was exported via htcondor from files or iterables + :param make_drone: The callable to create the drone :param iterable: an iterable yielding lines of CSV, such as an open file :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation :param pool_type: The type of pool to be yielded diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 1b0ef45..becf4f0 100644 --- a/lapis/scheduler.py +++ 
b/lapis/scheduler.py @@ -108,7 +108,7 @@ def _schedule_job(self, job) -> Drone: break else: cost += (drone.pool_resources[resource_type] - drone.resources[resource_type]) // \ - job.resources[resource_type] + job.resources[resource_type] cost /= len(resource_types) if cost <= 1: # directly start job From b5e39e848f97733299a88170c388e51aa7fa64bf Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 11:36:14 +0200 Subject: [PATCH 154/648] fixed line lengths --- lapis/drone.py | 51 +++++++++++++++++++++++++++------------ lapis/job.py | 22 +++++++++-------- lapis/job_io/htcondor.py | 17 ++++++++----- lapis/job_io/swf.py | 21 ++++++++++------ lapis/pool.py | 29 +++++++++++++--------- lapis/pool_io/htcondor.py | 13 ++++++---- lapis/pool_io/machines.py | 10 +++++--- lapis/scheduler.py | 34 ++++++++++++++++---------- lapis/simulator.py | 8 +++--- lapis/utility/monitor.py | 24 ++++++++++++------ 10 files changed, 147 insertions(+), 82 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 189ae14..4c7e56b 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -11,20 +11,24 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): - def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, exclusive: bool = False, + def __init__(self, scheduler, pool_resources: dict, + scheduling_duration: float, exclusive: bool = False, ignore_resources: list = None): """ :param scheduler: :param pool_resources: :param scheduling_duration: - :param exclusive: Determines if the drone is used exclusively by jobs in sequential order + :param exclusive: Determines if the drone is used exclusively by jobs + in sequential order """ super(Drone, self).__init__() self.scheduler = scheduler self.pool_resources = pool_resources self.resources = {resource: 0 for resource in self.pool_resources} if ignore_resources: - self._valid_resource_keys = [resource for resource in self.pool_resources if resource not in ignore_resources] + self._valid_resource_keys = [ + resource for resource in self.pool_resources + if resource not in ignore_resources] else: self._valid_resource_keys = self.pool_resources.keys() # shadowing requested resources to determine jobs to be killed @@ -42,7 +46,10 @@ def __init__(self, scheduler, pool_resources: dict, scheduling_duration: float, @property def theoretical_available_resources(self): - return {key: self.pool_resources[key] - self.resources[key] for key in self.pool_resources} + return { + key: self.pool_resources[key] - self.resources[key] + for key in self.pool_resources + } async def run(self): from lapis.utility.monitor import sampling_required @@ -78,7 +85,8 @@ def allocation(self) -> float: def _init_allocation_and_utilisation(self): resources = [] for resource_key in self._valid_resource_keys: - resources.append(self.resources[resource_key] / self.pool_resources[resource_key]) + resources.append( + self.resources[resource_key] / self.pool_resources[resource_key]) self._allocation = max(resources) self._utilisation = min(resources) @@ -90,7 +98,8 @@ async def shutdown(self): await (time + 1) # print("[drone %s] has been shut down" % self) - def _add_resources(self, keys: list, target: dict, source: dict, alternative_source: dict): + def _add_resources(self, keys: list, target: dict, source: dict, + alternative_source: dict): resources_exceeded = False for resource_key in keys: try: @@ -104,7 +113,8 @@ def _add_resources(self, keys: list, target: dict, source: dict, alternative_sou raise ResourcesExceeded() @staticmethod - def 
_remove_resources(keys: list, target: dict, source: dict, alternative_source: dict): + def _remove_resources(keys: list, target: dict, source: dict, + alternative_source: dict): for resource_key in keys: try: target[resource_key] -= source[resource_key] @@ -114,13 +124,16 @@ def _remove_resources(keys: list, target: dict, source: dict, alternative_source async def start_job(self, job: Job, kill: bool = False): """ Method manages to start a job in the context of the given drone. - The job is started independent of available resources. If resources of drone are exceeded, the job is killed. + The job is started independent of available resources. If resources of + drone are exceeded, the job is killed. :param job: the job to start - :param kill: if True, a job is killed when used resources exceed requested resources + :param kill: if True, a job is killed when used resources exceed + requested resources :return: """ - # TODO: ensure that jobs cannot immediately started on the same drone until the jobs did not allocate resources + # TODO: ensure that jobs cannot immediately started on the same drone + # until the jobs did not allocate resources async with Scope() as scope: from lapis.utility.monitor import sampling_required self._utilisation = self._allocation = None @@ -130,12 +143,14 @@ async def start_job(self, job: Job, kill: bool = False): job_keys = {*job.resources, *job.used_resources} try: - self._add_resources(job_keys, self.used_resources, job.used_resources, job.resources) + self._add_resources( + job_keys, self.used_resources, job.used_resources, job.resources) except ResourcesExceeded: job_execution.cancel() try: # TODO: we should allow for overbooking of resources - self._add_resources(job_keys, self.resources, job.resources, job.used_resources) + self._add_resources( + job_keys, self.resources, job.resources, job.used_resources) except ResourcesExceeded: job_execution.cancel() @@ -156,8 +171,10 @@ async def start_job(self, job: Job, kill: bool = False): await job_execution if job_execution.status == ActivityState.CANCELLED: for resource_key in job_keys: - usage = job.used_resources.get(resource_key, None) or job.resources.get(resource_key, None) - value = usage / (job.resources.get(resource_key, None) or self.pool_resources[resource_key]) + usage = job.used_resources.get(resource_key, None) \ + or job.resources.get(resource_key, None) + value = usage / (job.resources.get(resource_key, None) + or self.pool_resources[resource_key]) if value > 1: logging.info(str(round(time.now)), { "job_exceeds_%s" % resource_key: { @@ -166,8 +183,10 @@ async def start_job(self, job: Job, kill: bool = False): }) else: self.jobs -= 1 - self._remove_resources(job_keys, self.resources, job.resources, job.used_resources) - self._remove_resources(job_keys, self.used_resources, job.used_resources, job.resources) + self._remove_resources( + job_keys, self.resources, job.resources, job.used_resources) + self._remove_resources( + job_keys, self.used_resources, job.used_resources, job.resources) self._utilisation = self._allocation = None self.scheduler.update_drone(self) await sampling_required.set(True) diff --git a/lapis/job.py b/lapis/job.py index ed41c80..2f5872c 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -35,23 +35,24 @@ def job_demand(simulator): if value > 0: simulator.global_demand.put(value) logging.info(str(round(simulator.env.now)), {"user_demand_new": value}) - # print("[demand] raising user demand for %f at %d to %d" % (value, env.now, globals.global_demand.level)) class Job(object): - 
__slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", - "in_queue_until", "_name", "_success") + __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", + "queue_date", "in_queue_since", "in_queue_until", "_name", "_success") - def __init__(self, resources: dict, used_resources: dict, in_queue_since: float = 0, queue_date: float = 0, - name: str = None): + def __init__(self, resources: dict, used_resources: dict, in_queue_since: float = 0, + queue_date: float = 0, name: str = None): """ - Definition of a job that uses a specified amount of resources `used_resources` over a given amount of time, - `walltime`. A job is described by its user via the parameter `resources`. This is a user prediction and is - expected to deviate from `used_resources`. + Definition of a job that uses a specified amount of resources `used_resources` + over a given amount of time, `walltime`. A job is described by its user + via the parameter `resources`. This is a user prediction and is expected + to deviate from `used_resources`. :param resources: Requested resources of the job :param used_resources: Resource usage of the job - :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler + :param in_queue_since: Time when job was inserted into the queue of the + simulation scheduler :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job """ @@ -78,7 +79,8 @@ def successful(self) -> bool: @property def waiting_time(self) -> float: """ - The time the job spent in the simulators scheduling queue. `Inf` when the job is still waitiing. + The time the job spent in the simulators scheduling queue. `Inf` when + the job is still waitiing. 
:return: Time in queue """ diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 0b9078e..a7b2dcf 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -30,20 +30,25 @@ def htcondor_job_reader(iterable, resource_name_mapping={ for row in htcondor_reader: if float(row[used_resource_name_mapping["walltime"]]) <= 0: - logging.getLogger("implementation").warning("removed job from htcondor import", row) + logging.getLogger("implementation").warning( + "removed job from htcondor import", row) continue resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) + resources[key] = float(row[original_key]) \ + * unit_conversion_mapping.get(original_key, 1) except ValueError: pass - used_resources = {"cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"]) - / float(row[used_resource_name_mapping["walltime"]])) * unit_conversion_mapping.get( - used_resource_name_mapping[key], 1)} + used_resources = { + "cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"]) + / float(row[used_resource_name_mapping["walltime"]])) + * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + } for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] - used_resources[key] = float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) + used_resources[key] = \ + float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) yield Job( resources=resources, used_resources=used_resources, diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index bbb849f..1fad6eb 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -41,7 +41,8 @@ def swf_job_reader(iterable, resource_name_mapping={ "Preceding Job Number": 16, "Think Time from Preceding Job": 17 # s } - reader = csv.reader((line for line in iterable if line[0] != ';'), delimiter=' ', skipinitialspace=True) + reader = csv.reader((line for line in iterable if line[0] != ';'), + delimiter=' ', skipinitialspace=True) for row in reader: resources = {} used_resources = {} @@ -49,16 +50,22 @@ def swf_job_reader(iterable, resource_name_mapping={ value = float(row[header[resource_name_mapping[key]]]) used_value = float(row[header[used_resource_name_mapping[key]]]) if value >= 0: - resources[key] = value * used_resource_name_mapping.get(resource_name_mapping[key], 1) + resources[key] = value * used_resource_name_mapping.get( + resource_name_mapping[key], 1) if used_value >= 0: - used_resources[key] = used_value * used_resource_name_mapping.get(used_resource_name_mapping[key], 1) + used_resources[key] = used_value * used_resource_name_mapping.get( + used_resource_name_mapping[key], 1) resources[key] = float() # handle memory key = "memory" - resources[key] = (float(row[header[resource_name_mapping[key]]]) * float( - row[header[resource_name_mapping["cores"]]])) * unit_conversion_mapping.get(resource_name_mapping[key], 1) - used_resources[key] = (float(row[header[used_resource_name_mapping[key]]]) * float( - row[header[used_resource_name_mapping["cores"]]])) * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + resources[key] = \ + (float(row[header[resource_name_mapping[key]]]) + * float(row[header[resource_name_mapping["cores"]]])) \ + * unit_conversion_mapping.get(resource_name_mapping[key], 1) + used_resources[key] = \ + (float(row[header[used_resource_name_mapping[key]]]) + * float(row[header[used_resource_name_mapping["cores"]]])) \ + 
* unit_conversion_mapping.get(used_resource_name_mapping[key], 1) yield Job( resources=resources, diff --git a/lapis/pool.py b/lapis/pool.py index 6631a52..592b952 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -5,15 +5,17 @@ class Pool(interfaces.Pool): """ - A pool encapsulating a number of pools or drones. Given a specific demand, allocation and utilisation, the - pool is able to adapt in terms of number of drones providing the given resources. + A pool encapsulating a number of pools or drones. Given a specific demand, + allocation and utilisation, the pool is able to adapt in terms of number of + drones providing the given resources. :param capacity: Maximum number of pools that can be instantiated within the pool :param init: Number of pools to instantiate at creation time of the pool :param name: Name of the pool :param make_drone: Callable to create a drone with specific properties for this pool """ - def __init__(self, capacity: float = float('inf'), init: float = 0, name: str = None, make_drone: Callable = None): + def __init__(self, capacity: float = float('inf'), init: float = 0, + name: str = None, make_drone: Callable = None): super(Pool, self).__init__() assert make_drone self.make_drone = make_drone @@ -46,8 +48,9 @@ def init_pool(self, init: float = 0): # TODO: the run method currently needs to be called manually async def run(self): """ - Pool periodically checks the current demand and provided drones. If demand is higher than the current level, - the pool takes care of initialising new drones. Otherwise drones get removed. + Pool periodically checks the current demand and provided drones. + If demand is higher than the current level, the pool takes care of + initialising new drones. Otherwise drones get removed. """ async with Scope() as scope: while True: @@ -126,15 +129,19 @@ def __repr__(self): class StaticPool(Pool): """ - A static pool does not react on changing conditions regarding demand, allocation and utilisation but instead - initialises the `capacity` of given drones with initialised `resources`. - - :param capacity: Maximum number of pools that can be instantiated within the pool - :param resources: Dictionary of resources available for each pool instantiated within the pool + A static pool does not react on changing conditions regarding demand, + allocation and utilisation but instead initialises the `capacity` of given + drones with initialised `resources`. + + :param capacity: Maximum number of pools that can be instantiated within + the pool + :param resources: Dictionary of resources available for each pool + instantiated within the pool """ def __init__(self, capacity: float = 0, make_drone: Callable = None): assert capacity > 0, "Static pool was initialised without any resources..." 
- super(StaticPool, self).__init__(capacity=capacity, init=capacity, make_drone=make_drone) + super(StaticPool, self).__init__(capacity=capacity, init=capacity, + make_drone=make_drone) self._demand = capacity async def run(self): diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 05a3cd7..340402f 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -15,10 +15,12 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict = { "TotalSlotMemory": 1.024 / 1024 }, pool_type: Callable = Pool, make_drone: Callable = None): """ - Load a pool configuration that was exported via htcondor from files or iterables + Load a pool configuration that was exported via htcondor from files or + iterables :param iterable: an iterable yielding lines of CSV, such as an open file - :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation + :param resource_name_mapping: Mapping from given header names to well-defined + resources in simulation :param pool_type: The type of pool to be yielded :param make_drone: :return: Yields the :py:class:`Pool`s found in the given iterable @@ -33,6 +35,7 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict = { capacity = float("Inf") yield pool_type( capacity=capacity, - make_drone=partial(make_drone, {key: float(row[value]) * unit_conversion_mapping.get(value, 1) - for key, value in resource_name_mapping.items()}, - ignore_resources=["disk"])) + make_drone=partial(make_drone, { + key: float(row[value]) * unit_conversion_mapping.get(value, 1) + for key, value in resource_name_mapping.items()}, + ignore_resources=["disk"])) diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index 7b20695..d8bc3ed 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -10,11 +10,13 @@ def machines_pool_reader(iterable, resource_name_mapping: dict = { "memory": "RAM_per_node_in_KB" }, pool_type: Callable = Pool, make_drone: Callable = None): """ - Load a pool configuration that was exported via htcondor from files or iterables + Load a pool configuration that was exported via htcondor from files or + iterables :param make_drone: The callable to create the drone :param iterable: an iterable yielding lines of CSV, such as an open file - :param resource_name_mapping: Mapping from given header names to well-defined resources in simulation + :param resource_name_mapping: Mapping from given header names to well-defined + resources in simulation :param pool_type: The type of pool to be yielded :return: Yields the :py:class:`StaticPool`s found in the given iterable """ @@ -23,6 +25,8 @@ def machines_pool_reader(iterable, resource_name_mapping: dict = { for row_idx, row in enumerate(reader): yield pool_type( capacity=int(row["number_of_nodes"]), - make_drone=partial(make_drone, {key: float(row[value]) for key, value in resource_name_mapping.items()}), + make_drone=partial(make_drone, { + key: float(row[value]) for key, value in + resource_name_mapping.items()}), name=row["cluster_name"] ) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index becf4f0..2c9e283 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -16,14 +16,18 @@ def job_scheduler(simulator): class CondorJobScheduler(object): """ - Goal of the htcondor job scheduler is to have a scheduler that somehow mimics how htcondor does schedule jobs. - Htcondor does scheduling based on a priority queue. The priorities itself are managed by operators of htcondor. 
+ Goal of the htcondor job scheduler is to have a scheduler that somehow + mimics how htcondor does schedule jobs. + Htcondor does scheduling based on a priority queue. The priorities itself + are managed by operators of htcondor. So different instances can apparently behave very different. - In my case I am going to try building a priority queue that sorts job slots by increasing cost. The cost itself - is calculated based on the current strategy that is used at GridKa. The scheduler checks if a job either - exactly fits a slot or if it does fit into it several times. The cost for putting a job at a given slot is - given by the amount of resources that might remain unallocated. + In my case I am going to try building a priority queue that sorts job slots + by increasing cost. The cost itself is calculated based on the current + strategy that is used at GridKa. The scheduler checks if a job either + exactly fits a slot or if it does fit into it several times. The cost for + putting a job at a given slot is given by the amount of resources that + might remain unallocated. :return: """ @@ -58,9 +62,11 @@ def _add_drone(self, drone: Drone): if len(self.drone_cluster) > 0: for cluster in self.drone_cluster: current_distance = 0 - for key in {*cluster[0].theoretical_available_resources, *drone.theoretical_available_resources}: - current_distance += abs(cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0)) + for key in {*cluster[0].theoretical_available_resources, + *drone.theoretical_available_resources}: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0)) if current_distance < distance: minimum_distance_cluster = cluster distance = current_distance @@ -101,13 +107,15 @@ def _schedule_job(self, job) -> Drone: cost = float("Inf") break elif resource_type not in job.resources: - cost += drone.pool_resources[resource_type] - drone.resources[resource_type] - elif (drone.pool_resources[resource_type] - drone.resources[resource_type]) < \ - job.resources[resource_type]: + cost += drone.pool_resources[resource_type] \ + - drone.resources[resource_type] + elif (drone.pool_resources[resource_type] + - drone.resources[resource_type]) < job.resources[resource_type]: cost = float("Inf") break else: - cost += (drone.pool_resources[resource_type] - drone.resources[resource_type]) // \ + cost += (drone.pool_resources[resource_type] + - drone.resources[resource_type]) // \ job.resources[resource_type] cost /= len(resource_types) if cost <= 1: diff --git a/lapis/simulator.py b/lapis/simulator.py index d76486e..1a89e09 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,8 +6,9 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from lapis.utility.monitor import Monitoring, collect_pool_statistics, collect_user_demand, collect_job_statistics, \ - collect_resource_statistics, collect_cobald_cost +from lapis.utility.monitor import Monitoring, collect_pool_statistics, \ + collect_user_demand, collect_job_statistics, collect_resource_statistics, \ + collect_cobald_cost class Simulator(object): @@ -36,7 +37,8 @@ def create_job_generator(self, job_input, job_reader): def create_pools(self, pool_input, pool_reader, pool_type, controller=None): assert self.job_scheduler, "Scheduler needs to be created before pools" - for pool in pool_reader(iterable=pool_input, pool_type=pool_type, make_drone=partial(Drone, self.job_scheduler)): + for pool in 
pool_reader(iterable=pool_input, pool_type=pool_type, + make_drone=partial(Drone, self.job_scheduler)): self.pools.append(pool) if controller: self.controllers.append(controller(target=pool, rate=1)) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 5ad6a6c..4b34b6e 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -41,21 +41,29 @@ def collect_resource_statistics(simulator: "Simulator") -> dict: for resource_key in {*drone.resources, *drone.used_resources}: drone_resources.setdefault(resource_key, {}) try: - drone_resources[resource_key]["reserved"] += drone.resources[resource_key] + drone_resources[resource_key]["reserved"] += \ + drone.resources[resource_key] except KeyError: - drone_resources[resource_key]["reserved"] = drone.resources[resource_key] + drone_resources[resource_key]["reserved"] = \ + drone.resources[resource_key] try: - drone_resources[resource_key]["used"] += drone.used_resources[resource_key] + drone_resources[resource_key]["used"] += \ + drone.used_resources[resource_key] except KeyError: - drone_resources[resource_key]["used"] = drone.used_resources[resource_key] + drone_resources[resource_key]["used"] = \ + drone.used_resources[resource_key] try: - drone_resources[resource_key]["available"] += drone.pool_resources[resource_key] - drone.resources[resource_key] + drone_resources[resource_key]["available"] += \ + drone.pool_resources[resource_key] - drone.resources[resource_key] except KeyError: - drone_resources[resource_key]["available"] = drone.pool_resources[resource_key] - drone.resources[resource_key] + drone_resources[resource_key]["available"] = \ + drone.pool_resources[resource_key] - drone.resources[resource_key] try: - drone_resources[resource_key]["total"] += drone.pool_resources[resource_key] + drone_resources[resource_key]["total"] += \ + drone.pool_resources[resource_key] except KeyError: - drone_resources[resource_key]["total"] = drone.pool_resources[resource_key] + drone_resources[resource_key]["total"] = \ + drone.pool_resources[resource_key] return { "empty_drones": empty_drones, "drone_resources": drone_resources From 3d26b356554b21bd0f47a39b55b50eba7204d876 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:14:52 +0200 Subject: [PATCH 155/648] added extra packages for flake --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 34aade3..0293bb8 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ ], extras_require={ 'docs': ["sphinx", "sphinxcontrib-tikz"], + 'contrib': ['flake8', 'flake8-bugbear'] }, # metadata for package search license='MIT', From 9ae8efe4c15dc1bcc720830fd7a4cb4995696715 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:15:19 +0200 Subject: [PATCH 156/648] ignored break before binary operator --- .pep8speaks.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 65c419a..5a1d249 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -3,4 +3,6 @@ scanner: linter: flake8 flake8: - max-line-length: 88 # flake8-bugbear uses 80*1.1 \ No newline at end of file + max-line-length: 88 # flake8-bugbear uses 80*1.1 + ignore: + - W503 \ No newline at end of file From 07a81fd55953175118e2a5b25fb3005c5e17e316 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:16:10 +0200 Subject: [PATCH 157/648] adapted code to work with most recent version of usim --- lapis/drone.py | 8 ++++---- lapis/job.py | 7 ++----- 2 files changed, 6 insertions(+), 9 deletions(-) 
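For orientation before the diff itself: this patch follows a rename in the usim API. The symbols visible below are TaskState (formerly ActivityState), the awaitable .done attribute of a running task, and the TaskCancelled exception (formerly the private CancelActivity). The following is a minimal, illustrative sketch of the new pattern, not part of the patch; the helper name wait_and_check and the use of Scope.do are assumptions, since the diff only shows the renamed symbols.

from usim import Scope, TaskState, TaskCancelled, time

async def wait_and_check(walltime):
    # hypothetical helper, only to illustrate the usim calls appearing in the diff below
    async def payload():
        try:
            await (time + walltime)       # advance simulation time, as Job.run does
        except TaskCancelled:             # replaces the former CancelActivity
            pass

    async with Scope() as scope:
        task = scope.do(payload())        # assumption: task handle obtained via Scope.do
        await task.done                   # await the .done condition instead of the task itself
        return task.status == TaskState.CANCELLED   # replaces ActivityState.CANCELLED
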
diff --git a/lapis/drone.py b/lapis/drone.py index 4c7e56b..7103d35 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,7 +1,7 @@ import logging from cobald import interfaces -from usim import time, Scope, instant, ActivityState +from usim import time, Scope, instant, TaskState from lapis.job import Job @@ -165,11 +165,11 @@ async def start_job(self, job: Job, kill: bool = False): # check is not relevant if the data is not stored pass self.scheduler.update_drone(self) - if job_execution.status != ActivityState.CANCELLED: + if job_execution.status != TaskState.CANCELLED: self.jobs += 1 await sampling_required.set(True) - await job_execution - if job_execution.status == ActivityState.CANCELLED: + await job_execution.done + if job_execution.status == TaskState.CANCELLED: for resource_key in job_keys: usage = job.used_resources.get(resource_key, None) \ or job.resources.get(resource_key, None) diff --git a/lapis/job.py b/lapis/job.py index 2f5872c..f640806 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -3,10 +3,7 @@ import logging from usim import time - - -# TODO: needs refactoring -from usim._primitives.activity import CancelActivity +from usim import TaskCancelled def job_demand(simulator): @@ -99,7 +96,7 @@ async def run(self): }) try: await (time + self.walltime) - except CancelActivity: + except TaskCancelled: self._success = False except BaseException: self._success = False From afd7ed1636ab523804b7f951d989eee3925daa02 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:18:41 +0200 Subject: [PATCH 158/648] made running of flake8 mandatory --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d0c38bc..398bdd2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,10 +16,11 @@ before_script: - pip --version - export PYTHONHASHSEED=${PYTHONHASHSEED:-${RANDOM}} - echo "export PYTHONHASHSEED=${PYTHONHASHSEED}" - - pip install . + - pip install .[contrib] - pip install codecov - export COVERAGE_PROCESS_START=$(pwd)/.coveragerc script: + - python -m flake8 - coverage run setup.py test after_success: - coverage report && codecov From 365be44004a63b699ce3eb941e8b8a967e887bbb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:34:09 +0200 Subject: [PATCH 159/648] fixed line length warnings --- lapis/__about__.py | 3 ++- lapis/cli/simulate.py | 37 +++++++++++++++++++---------- lapis/controller.py | 22 ++++++++++------- lapis_tests/job_io/test_htcondor.py | 6 +++-- lapis_tests/job_io/test_swf.py | 3 ++- 5 files changed, 47 insertions(+), 24 deletions(-) diff --git a/lapis/__about__.py b/lapis/__about__.py index 931ce92..d7e7ed5 100644 --- a/lapis/__about__.py +++ b/lapis/__about__.py @@ -6,7 +6,8 @@ This is a **draft** for a scheduling simulator utilising opportunistic resources. 
""" __title__ = 'lapis' -__summary__ = 'Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator' +__summary__ = 'Lapis is an adaptable, performant, and interactive scheduling ' \ + '(Lapis) simulator' __url__ = 'https://github.com/MaineKuehn/lapis' __version__ = '0.1.0' diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 4005f6b..4ede919 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -44,7 +44,8 @@ def cli(ctx, seed, until, log_tcp, log_file): ctx.obj['seed'] = seed ctx.obj['until'] = until if log_tcp: - socketHandler = JSONSocketHandler('localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT) + socketHandler = JSONSocketHandler( + 'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT) socketHandler.setFormatter(JsonFormatter()) monitoring_logger.addHandler(socketHandler) if log_file: @@ -54,14 +55,17 @@ def cli(ctx, seed, until, log_tcp, log_file): @cli.command() -@click.option("--job-file", "job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--pool-file", "pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--job-file", "job_file", type=( + click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--pool-file", "pool_file", type=( + click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context def static(ctx, job_file, pool_file): click.echo("starting static environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file - simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) + simulator.create_job_generator( + job_input=file, job_reader=job_import_mapper[file_type]) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: pool_file, pool_file_type = current_pool @@ -73,14 +77,17 @@ def static(ctx, job_file, pool_file): @cli.command() -@click.option("--job-file", "job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--pool-file", "pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--job-file", "job_file", type=( + click.File("r"), click.Choice(list(job_import_mapper.keys())))) +@click.option("--pool-file", "pool_file", type=( + click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context def dynamic(ctx, job_file, pool_file): click.echo("starting dynamic environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file - simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) + simulator.create_job_generator( + job_input=file, job_reader=job_import_mapper[file_type]) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: file, file_type = current_pool @@ -93,19 +100,25 @@ def dynamic(ctx, job_file, pool_file): @cli.command() -@click.option("--job-file", "job_file", type=(click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--static-pool-file", "static_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) -@click.option("--dynamic-pool-file", "dynamic_pool_file", type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--job-file", "job_file", type=( + click.File("r"), click.Choice(list(job_import_mapper.keys())))) 
+@click.option("--static-pool-file", "static_pool_file", type=( + click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option("--dynamic-pool-file", "dynamic_pool_file", type=( + click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) @click.pass_context def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): click.echo("starting hybrid environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file - simulator.create_job_generator(job_input=file, job_reader=job_import_mapper[file_type]) + simulator.create_job_generator( + job_input=file, job_reader=job_import_mapper[file_type]) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in static_pool_file: file, file_type = current_pool - simulator.create_pools(pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=StaticPool) + simulator.create_pools( + pool_input=file, pool_reader=pool_import_mapper[file_type], + pool_type=StaticPool) for current_pool in dynamic_pool_file: file, file_type = current_pool simulator.create_pools( diff --git a/lapis/controller.py b/lapis/controller.py index aba6fb7..32681bf 100644 --- a/lapis/controller.py +++ b/lapis/controller.py @@ -5,8 +5,10 @@ class SimulatedLinearController(LinearController): - def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): - super(SimulatedLinearController, self).__init__(target, low_utilisation, high_allocation, rate, interval) + def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, + rate=1, interval=1): + super(SimulatedLinearController, self).__init__( + target, low_utilisation, high_allocation, rate, interval) async def run(self): while True: @@ -15,11 +17,13 @@ async def run(self): class SimulatedRelativeSupplyController(RelativeSupplyController): - def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, low_scale=0.9, high_scale=1.1, + def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, + low_scale=0.9, high_scale=1.1, interval=1): - super(SimulatedRelativeSupplyController, self).__init__(target=target, low_utilisation=low_utilisation, - high_allocation=high_allocation, low_scale=low_scale, - high_scale=high_scale, interval=interval) + super(SimulatedRelativeSupplyController, self).__init__( + target=target, low_utilisation=low_utilisation, + high_allocation=high_allocation, low_scale=low_scale, + high_scale=high_scale, interval=interval) async def run(self): while True: @@ -28,9 +32,11 @@ async def run(self): class SimulatedCostController(SimulatedLinearController): - def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1): + def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, + rate=1, interval=1): self.current_cost = 1 - super(SimulatedCostController, self).__init__(target, low_utilisation, high_allocation, rate, interval) + super(SimulatedCostController, self).__init__( + target, low_utilisation, high_allocation, rate, interval) def regulate(self, interval): allocation = 0 diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index fd533e7..c6ed66b 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -5,13 +5,15 @@ class TestHtcondorJobReader(object): def test_simple_read(self): - with open(os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv")) as input_file: + with 
open(os.path.join(os.path.dirname(__file__), "..", "data", + "htcondor_jobs.csv")) as input_file: jobs = 0 for job in htcondor_job_reader(input_file): assert job is not None jobs += 1 assert jobs > 0 - with open(os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv")) as input_file: + with open(os.path.join(os.path.dirname(__file__), "..", "data", + "htcondor_jobs.csv")) as input_file: # ensure that one job was removed by importer (wrong walltime given) lines = sum(1 for _ in input_file) assert jobs == (lines - 2) diff --git a/lapis_tests/job_io/test_swf.py b/lapis_tests/job_io/test_swf.py index 3bb861f..04bca06 100644 --- a/lapis_tests/job_io/test_swf.py +++ b/lapis_tests/job_io/test_swf.py @@ -4,7 +4,8 @@ class TestSwfJobReader(object): def test_simple_read(self): - with open(os.path.join(os.path.dirname(__file__), "..", "data", "swf_jobs.swf")) as input_file: + with open(os.path.join(os.path.dirname(__file__), "..", "data", + "swf_jobs.swf")) as input_file: job_count = 0 for job in swf_job_reader(input_file): assert job is not None From d63fad2f0f12754f2a36b69d73e3d416dfef1518 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:36:17 +0200 Subject: [PATCH 160/648] ignored warning about mutable datastructures for argument defaults --- lapis/job_io/htcondor.py | 6 +++--- lapis/job_io/swf.py | 6 +++--- lapis/pool_io/htcondor.py | 4 ++-- lapis/pool_io/machines.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index a7b2dcf..1115f53 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -4,18 +4,18 @@ from lapis.job import Job -def htcondor_job_reader(iterable, resource_name_mapping={ +def htcondor_job_reader(iterable, resource_name_mapping={ # noqa: B006 "cores": "RequestCpus", "walltime": "RequestWalltime", # s "memory": "RequestMemory", # MiB "disk": "RequestDisk" # KiB -}, used_resource_name_mapping={ +}, used_resource_name_mapping={ # noqa: B006 "queuetime": "QDate", "walltime": "RemoteWallClockTime", # s "cores": "Number of Allocated Processors", "memory": "MemoryUsage", # MB "disk": "DiskUsage_RAW" # KiB -}, unit_conversion_mapping={ +}, unit_conversion_mapping={ # noqa: B006 "RequestCpus": 1, "RequestWalltime": 1, "RequestMemory": 1.024 / 1024, diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 1fad6eb..4bb5024 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -8,16 +8,16 @@ from lapis.job import Job -def swf_job_reader(iterable, resource_name_mapping={ +def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 "cores": "Requested Number of Processors", "walltime": "Requested Time", "memory": "Requested Memory" -}, used_resource_name_mapping={ +}, used_resource_name_mapping={ # noqa: B006 "walltime": "Run Time", "cores": "Number of Allocated Processors", "memory": "Used Memory", "queuetime": "Submit Time" -}, unit_conversion_mapping={ +}, unit_conversion_mapping={ # noqa: B006 "Used Memory": 1 / 1024 / 1024, "Requested Memory": 1 / 2114 / 1024 }): diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 340402f..62fe001 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -5,11 +5,11 @@ from ..pool import Pool -def htcondor_pool_reader(iterable, resource_name_mapping: dict = { +def htcondor_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 "cores": "TotalSlotCPUs", "disk": "TotalSlotDisk", # MiB "memory": "TotalSlotMemory" # MiB -}, unit_conversion_mapping: dict = { +}, 
unit_conversion_mapping: dict = { # noqa: B006 "TotalSlotCPUs": 1, "TotalSlotDisk": 1.024 / 1024, "TotalSlotMemory": 1.024 / 1024 diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index d8bc3ed..7f5b32e 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -5,7 +5,7 @@ from ..pool import Pool -def machines_pool_reader(iterable, resource_name_mapping: dict = { +def machines_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 "cores": "CPUs_per_node", "memory": "RAM_per_node_in_KB" }, pool_type: Callable = Pool, make_drone: Callable = None): From 20755ccb6d607489af96f875b1940a4c64b5b807 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 14:36:45 +0200 Subject: [PATCH 161/648] iteration does not use enumerate --- lapis/pool_io/htcondor.py | 2 +- lapis/pool_io/machines.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 62fe001..b491ccc 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -27,7 +27,7 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 """ assert make_drone reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) - for row_idx, row in enumerate(reader): + for row in reader: try: capacity = int(row["Count"]) except ValueError: diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index 7f5b32e..c2c55a5 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -22,7 +22,7 @@ def machines_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 """ assert make_drone reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) - for row_idx, row in enumerate(reader): + for row in reader: yield pool_type( capacity=int(row["number_of_nodes"]), make_drone=partial(make_drone, { From 5fc89ccd30bc0ae443148f31e318443f8b37e3d1 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 7 May 2019 15:49:56 +0200 Subject: [PATCH 162/648] Update .pep8speaks.yml Co-Authored-By: eileen-kuehn --- .pep8speaks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 5a1d249..e538818 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -5,4 +5,4 @@ scanner: flake8: max-line-length: 88 # flake8-bugbear uses 80*1.1 ignore: - - W503 \ No newline at end of file + - W503 From 014c329c5aa0df800036e431efdbd759ef0d0394 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 7 May 2019 15:52:36 +0200 Subject: [PATCH 163/648] Apply suggestions from code review Co-Authored-By: eileen-kuehn --- lapis/drone.py | 4 ++-- lapis/job.py | 2 +- lapis/pool.py | 4 ++-- lapis/pool_io/htcondor.py | 2 +- lapis/pool_io/machines.py | 2 +- setup.cfg | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 7103d35..521bf82 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -19,7 +19,7 @@ def __init__(self, scheduler, pool_resources: dict, :param pool_resources: :param scheduling_duration: :param exclusive: Determines if the drone is used exclusively by jobs - in sequential order + in sequential order """ super(Drone, self).__init__() self.scheduler = scheduler @@ -129,7 +129,7 @@ async def start_job(self, job: Job, kill: bool = False): :param job: the job to start :param kill: if True, a job is killed when used resources exceed - requested resources + requested resources :return: """ # TODO: ensure that jobs cannot immediately started on the same drone diff --git 
a/lapis/job.py b/lapis/job.py index f640806..96d44a8 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -49,7 +49,7 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float :param resources: Requested resources of the job :param used_resources: Resource usage of the job :param in_queue_since: Time when job was inserted into the queue of the - simulation scheduler + simulation scheduler :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job """ diff --git a/lapis/pool.py b/lapis/pool.py index 592b952..0d3d931 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -134,9 +134,9 @@ class StaticPool(Pool): drones with initialised `resources`. :param capacity: Maximum number of pools that can be instantiated within - the pool + the pool :param resources: Dictionary of resources available for each pool - instantiated within the pool + instantiated within the pool """ def __init__(self, capacity: float = 0, make_drone: Callable = None): assert capacity > 0, "Static pool was initialised without any resources..." diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index b491ccc..0dd3e90 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -20,7 +20,7 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 :param iterable: an iterable yielding lines of CSV, such as an open file :param resource_name_mapping: Mapping from given header names to well-defined - resources in simulation + resources in simulation :param pool_type: The type of pool to be yielded :param make_drone: :return: Yields the :py:class:`Pool`s found in the given iterable diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index c2c55a5..d7272ac 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -16,7 +16,7 @@ def machines_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 :param make_drone: The callable to create the drone :param iterable: an iterable yielding lines of CSV, such as an open file :param resource_name_mapping: Mapping from given header names to well-defined - resources in simulation + resources in simulation :param pool_type: The type of pool to be yielded :return: Yields the :py:class:`StaticPool`s found in the given iterable """ diff --git a/setup.cfg b/setup.cfg index 09dbcbb..89faa06 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,4 +6,4 @@ statistics = True max-line-length = 80 ignore = E501, B008, B011, W503 select = C,E,F,W,B,B9 -exclude = docs,.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg \ No newline at end of file +exclude = docs,.svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg From d5a2d37d98e469820c90d61c265dadffcd81be92 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 7 May 2019 15:55:14 +0200 Subject: [PATCH 164/648] resolved comments from pull request --- lapis/drone.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index 521bf82..07f5e46 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -28,7 +28,8 @@ def __init__(self, scheduler, pool_resources: dict, if ignore_resources: self._valid_resource_keys = [ resource for resource in self.pool_resources - if resource not in ignore_resources] + if resource not in ignore_resources + ] else: self._valid_resource_keys = self.pool_resources.keys() # shadowing requested resources to determine jobs to be killed From e79da8b671db6ddd199e1a55c53f2531e3295458 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 24 Jun 2019 
18:47:56 +0200 Subject: [PATCH 165/648] fixed conversion of units for swf job import to work correctly --- lapis/job_io/swf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 4bb5024..0b0eb01 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -50,12 +50,11 @@ def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 value = float(row[header[resource_name_mapping[key]]]) used_value = float(row[header[used_resource_name_mapping[key]]]) if value >= 0: - resources[key] = value * used_resource_name_mapping.get( + resources[key] = value * unit_conversion_mapping.get( resource_name_mapping[key], 1) if used_value >= 0: - used_resources[key] = used_value * used_resource_name_mapping.get( + used_resources[key] = used_value * unit_conversion_mapping.get( used_resource_name_mapping[key], 1) - resources[key] = float() # handle memory key = "memory" resources[key] = \ From 699c729751dc1ce54b4588a791832d1ad2abf7f0 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 26 Jun 2019 22:54:14 +0200 Subject: [PATCH 166/648] added argument to start logging to telegraf via default udp logging port --- lapis/cli/simulate.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 4ede919..34c2339 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -2,6 +2,7 @@ import logging.handlers from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter from lapis.controller import SimulatedLinearController from lapis.job_io.htcondor import htcondor_job_reader @@ -12,14 +13,29 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator +from usim import time -class JSONSocketHandler(logging.handlers.SocketHandler): + +class LoggingSocketHandler(logging.handlers.SocketHandler): def makePickle(self, record): return self.format(record).encode() +class LoggingUDPSocketHandler(logging.handlers.DatagramHandler): + def makePickle(self, record): + return self.format(record).encode() + + +class TimeFilter(logging.Filter): + def filter(self, record): + record.created = time.now + return True + + monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) +time_filter = TimeFilter() +monitoring_logger.addFilter(time_filter) last_step = 0 @@ -38,13 +54,14 @@ def makePickle(self, record): @click.option("--until", type=float) @click.option("--log-tcp", "log_tcp", is_flag=True) @click.option("--log-file", "log_file", type=click.File("w")) +@click.option("--log-telegraf", "log_telegraf", is_flag=True) @click.pass_context -def cli(ctx, seed, until, log_tcp, log_file): +def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): ctx.ensure_object(dict) ctx.obj['seed'] = seed ctx.obj['until'] = until if log_tcp: - socketHandler = JSONSocketHandler( + socketHandler = LoggingSocketHandler( 'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT) socketHandler.setFormatter(JsonFormatter()) monitoring_logger.addHandler(socketHandler) @@ -52,6 +69,11 @@ def cli(ctx, seed, until, log_tcp, log_file): streamHandler = logging.StreamHandler(stream=log_file) streamHandler.setFormatter(JsonFormatter()) monitoring_logger.addHandler(streamHandler) + if log_telegraf: + telegrafHandler = LoggingUDPSocketHandler( + "localhost", logging.handlers.DEFAULT_UDP_LOGGING_PORT) + telegrafHandler.setFormatter(LineProtocolFormatter(resolution=1)) + 
monitoring_logger.addHandler(telegrafHandler) @cli.command() From 3ca3877e2598ae82f03af6688a14316a02ac27ee Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 26 Jun 2019 22:56:30 +0200 Subject: [PATCH 167/648] added context as message instead of simulation time when logging --- lapis/drone.py | 2 +- lapis/job.py | 8 ++++---- lapis/utility/monitor.py | 13 ++++++------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 07f5e46..f8ab250 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -177,7 +177,7 @@ async def start_job(self, job: Job, kill: bool = False): value = usage / (job.resources.get(resource_key, None) or self.pool_resources[resource_key]) if value > 1: - logging.info(str(round(time.now)), { + logging.info("job_status", { "job_exceeds_%s" % resource_key: { repr(job): value } diff --git a/lapis/job.py b/lapis/job.py index 96d44a8..91c2084 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -31,7 +31,7 @@ def job_demand(simulator): value = round(value) if value > 0: simulator.global_demand.put(value) - logging.info(str(round(simulator.env.now)), {"user_demand_new": value}) + logging.info("user_demand", {"user_demand_new": value}) class Job(object): @@ -87,7 +87,7 @@ def waiting_time(self) -> float: async def run(self): self.in_queue_until = time.now - logging.info(str(round(time.now)), { + logging.info("job_status", { "job_queue_time": { repr(self): self.queue_date }, "job_waiting_time": { @@ -102,7 +102,7 @@ async def run(self): self._success = False raise else: - logging.info(str(round(time.now)), { + logging.info("job_status", { "job_wall_time": { repr(self): self.walltime } @@ -133,6 +133,6 @@ async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): job = None else: if count > 0: - logging.info(str(round(time.now)), {"user_demand_new": count}) + logging.info("user_demand", {"user_demand_new": count}) count = 0 await (time == current_time) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 4b34b6e..c2f0c9b 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -2,7 +2,7 @@ import logging -from usim import each, Flag, time +from usim import each, Flag from lapis.cost import cobald_cost @@ -22,14 +22,13 @@ async def run(self): async for _ in each(delay=1): await sampling_required await sampling_required.set(False) - result = {} - for statistic in self._statistics: + for name, statistic in self._statistics: # do the logging - result.update(statistic(self.simulator)) - logging.info(str(round(time.now)), result) + logging.info(name, statistic(self.simulator)) - def register_statistic(self, statistic: Callable): - self._statistics.append(statistic) + def register_statistic(self, statistic: Callable, name: str = "lapis_data"): + assert name is not None + self._statistics.append((name, statistic)) def collect_resource_statistics(simulator: "Simulator") -> dict: From 6dd6d7871eb41fb97392b8fd65730cd83edab559 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 11:50:09 +0200 Subject: [PATCH 168/648] moved TimeFilter into utility module --- lapis/cli/simulate.py | 8 +------- lapis/utility/monitor.py | 8 +++++++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 34c2339..31a5964 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -13,7 +13,7 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator -from usim import time +from lapis.utility.monitor import TimeFilter class 
LoggingSocketHandler(logging.handlers.SocketHandler): @@ -26,12 +26,6 @@ def makePickle(self, record): return self.format(record).encode() -class TimeFilter(logging.Filter): - def filter(self, record): - record.created = time.now - return True - - monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) time_filter = TimeFilter() diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index c2f0c9b..53e9a8f 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -2,7 +2,7 @@ import logging -from usim import each, Flag +from usim import each, Flag, time from lapis.cost import cobald_cost @@ -12,6 +12,12 @@ sampling_required = Flag() +class TimeFilter(logging.Filter): + def filter(self, record): + record.created = time.now + return True + + class Monitoring(object): # TODO: we need to check how to integrate the normalization factor def __init__(self, simulator: "Simulator"): From 16c97d7ae1a17e42c178ad7b659277795971a01a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 14:06:52 +0200 Subject: [PATCH 169/648] added docstring for class timefilter --- lapis/utility/monitor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 53e9a8f..134a3a3 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -13,7 +13,11 @@ class TimeFilter(logging.Filter): - def filter(self, record): + """ + py:class:`TimeFilter` takes care to modify the created timestamp of a log + record to be set to the current simulation time. + """ + def filter(self, record) -> bool: record.created = time.now return True From c2484ac69b984f2b87d299137a630d6b9364b1a0 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 17:14:44 +0200 Subject: [PATCH 170/648] moved tcp and udp logger to utility monitor module --- lapis/cli/simulate.py | 23 ++++++----------------- lapis/utility/monitor.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 31a5964..020b3a5 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -13,23 +13,8 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator -from lapis.utility.monitor import TimeFilter - - -class LoggingSocketHandler(logging.handlers.SocketHandler): - def makePickle(self, record): - return self.format(record).encode() - - -class LoggingUDPSocketHandler(logging.handlers.DatagramHandler): - def makePickle(self, record): - return self.format(record).encode() - - -monitoring_logger = logging.getLogger() -monitoring_logger.setLevel(logging.DEBUG) -time_filter = TimeFilter() -monitoring_logger.addFilter(time_filter) +from lapis.utility.monitor import TimeFilter, LoggingSocketHandler, \ + LoggingUDPSocketHandler last_step = 0 @@ -54,6 +39,10 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): ctx.ensure_object(dict) ctx.obj['seed'] = seed ctx.obj['until'] = until + monitoring_logger = logging.getLogger() + monitoring_logger.setLevel(logging.DEBUG) + time_filter = TimeFilter() + monitoring_logger.addFilter(time_filter) if log_tcp: socketHandler = LoggingSocketHandler( 'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 134a3a3..41b2da9 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -2,6 +2,8 @@ import logging +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import 
LineProtocolFormatter from usim import each, Flag, time from lapis.cost import cobald_cost @@ -12,6 +14,16 @@ sampling_required = Flag() +class LoggingSocketHandler(logging.handlers.SocketHandler): + def makePickle(self, record): + return self.format(record).encode() + + +class LoggingUDPSocketHandler(logging.handlers.DatagramHandler): + def makePickle(self, record): + return self.format(record).encode() + + class TimeFilter(logging.Filter): """ py:class:`TimeFilter` takes care to modify the created timestamp of a log From d48eb86fb4c4ec4b075a717f2714798ff13fcbb2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 17:16:06 +0200 Subject: [PATCH 171/648] first implementation of logging to fit the new database scheme --- lapis/simulator.py | 10 +- lapis/utility/monitor.py | 194 +++++++++++++++++++++++++-------------- 2 files changed, 130 insertions(+), 74 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 1a89e09..a1827cd 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,8 +6,8 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from lapis.utility.monitor import Monitoring, collect_pool_statistics, \ - collect_user_demand, collect_job_statistics, collect_resource_statistics, \ +from lapis.utility.monitor import Monitoring, collect_pool_cobald_statistics, \ + collect_user_demand, collect_job_statistics, collect_drone_cobald_statistics, \ collect_cobald_cost @@ -26,11 +26,11 @@ def __init__(self, seed=1234): def enable_monitoring(self): self.monitoring = Monitoring(self) - self.monitoring.register_statistic(collect_pool_statistics) + self.monitoring.register_statistic(collect_pool_cobald_statistics) self.monitoring.register_statistic(collect_user_demand) self.monitoring.register_statistic(collect_job_statistics) - self.monitoring.register_statistic(collect_resource_statistics) - self.monitoring.register_statistic(collect_cobald_cost) + self.monitoring.register_statistic(collect_drone_cobald_statistics) + # self.monitoring.register_statistic(collect_cobald_cost) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 41b2da9..abdbf7d 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -1,3 +1,4 @@ +import copy from typing import Callable, TYPE_CHECKING import logging @@ -35,7 +36,6 @@ def filter(self, record) -> bool: class Monitoring(object): - # TODO: we need to check how to integrate the normalization factor def __init__(self, simulator: "Simulator"): self.simulator = simulator self._statistics = [] @@ -44,51 +44,52 @@ async def run(self): async for _ in each(delay=1): await sampling_required await sampling_required.set(False) - for name, statistic in self._statistics: + for statistic in self._statistics: # do the logging - logging.info(name, statistic(self.simulator)) - - def register_statistic(self, statistic: Callable, name: str = "lapis_data"): - assert name is not None - self._statistics.append((name, statistic)) - - -def collect_resource_statistics(simulator: "Simulator") -> dict: - empty_drones = 0 - drone_resources = {} + for record in statistic(self.simulator): + logging.getLogger(statistic.name).info( + statistic.name, record + ) + + def register_statistic(self, statistic: Callable): + assert hasattr(statistic, "name") and hasattr(statistic, "logging_formatter") + self._statistics.append(statistic) + + # prepare the logger + logger = logging.getLogger(statistic.name) 
+ if len(logger.handlers) == 0: + # append handlers of default logger and add required formatters + root_logger = logging.getLogger() + for handler in root_logger.handlers: + new_handler = copy.copy(handler) + new_handler.setFormatter(statistic.logging_formatter.get( + type(handler).__name__, JsonFormatter())) + + +def collect_resource_statistics(simulator: "Simulator") -> list: + results = [] for drone in simulator.job_scheduler.drone_list: - if drone.allocation == 0: - empty_drones += 1 - for resource_key in {*drone.resources, *drone.used_resources}: - drone_resources.setdefault(resource_key, {}) - try: - drone_resources[resource_key]["reserved"] += \ - drone.resources[resource_key] - except KeyError: - drone_resources[resource_key]["reserved"] = \ - drone.resources[resource_key] - try: - drone_resources[resource_key]["used"] += \ - drone.used_resources[resource_key] - except KeyError: - drone_resources[resource_key]["used"] = \ - drone.used_resources[resource_key] - try: - drone_resources[resource_key]["available"] += \ - drone.pool_resources[resource_key] - drone.resources[resource_key] - except KeyError: - drone_resources[resource_key]["available"] = \ - drone.pool_resources[resource_key] - drone.resources[resource_key] - try: - drone_resources[resource_key]["total"] += \ - drone.pool_resources[resource_key] - except KeyError: - drone_resources[resource_key]["total"] = \ - drone.pool_resources[resource_key] - return { - "empty_drones": empty_drones, - "drone_resources": drone_resources - } + for resource_type in {*drone.resources, *drone.used_resources}: + results.append({ + "resource_type": resource_type, + "pool_configuration": None, + "pool_type": "drone", + "pool": repr(drone), + "used_ratio": drone.used_resources.get(resource_type, 0) / + drone.resources.get(resource_type, 0) + }) + return results + + +collect_resource_statistics.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + tags={"tardis", "resource_type", "pool_configuration", "pool_type"}, + resolution=1 + ) +} +collect_resource_statistics.name = "resource_status" def collect_cobald_cost(simulator: "Simulator") -> dict: @@ -102,36 +103,91 @@ def collect_cobald_cost(simulator: "Simulator") -> dict: } -def collect_user_demand(simulator: "Simulator") -> dict: - return { - "user_demand": len(simulator.job_scheduler.job_queue) - } +def collect_user_demand(simulator: "Simulator") -> list: + return [{ + "value": len(simulator.job_scheduler.job_queue) + }] + + +collect_user_demand.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + resolution=1 + ) +} +collect_user_demand.name = "user_demand" -def collect_job_statistics(simulator: "Simulator") -> dict: +def collect_job_statistics(simulator: "Simulator") -> list: result = 0 for drone in simulator.job_scheduler.drone_list: result += drone.jobs - return { - "running_jobs": result - } + return [{ + "job_count": result + }] + +collect_job_statistics.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type"}, + resolution=1 + ) +} +collect_job_statistics.name = 
"cobald_status" -def collect_pool_statistics(simulator: "Simulator") -> dict: - pool_demand = {} - pool_supply = {} - pool_utilisation = {} - pool_allocation = {} + +def collect_drone_cobald_statistics(simulator: "Simulator") -> list: + results = [] + for drone in simulator.job_scheduler.drone_list: + results.append({ + "pool_configuration": None, + "pool_type": "drone", + "pool": repr(drone), + "allocation": drone.allocation, + "utilisation": drone.utilisation, + "demand": drone.demand, + "supply": drone.supply, + "job_count": drone.jobs + }) + return results + + +collect_drone_cobald_statistics.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + tags={"tardis", "resource_type", "pool_configuration", "pool_type"}, + resolution=1 + ) +} +collect_drone_cobald_statistics.name = "cobald_status" + + +def collect_pool_cobald_statistics(simulator: "Simulator") -> list: + results = [] for pool in simulator.pools: - pool_demand[repr(pool)] = pool.demand - pool_supply[repr(pool)] = pool.supply - pool_utilisation[repr(pool)] = pool.utilisation - pool_allocation[repr(pool)] = pool.allocation - return { - "pool": { - "demand": pool_demand, - "supply": pool_supply, - "allocation": pool_allocation, - "utilisation": pool_utilisation - } - } + results.append({ + "pool_configuration": None, + "pool_type": "pool", + "pool": repr(pool), + "allocation": pool.allocation, + "utilisation": pool.utilisation, + "demand": pool.demand, + "supply": pool.supply, + }) + return results + + +collect_pool_cobald_statistics.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type"}, + resolution=1 + ) +} +collect_pool_cobald_statistics.name = "cobald_status" + From 46190c19856fb273d2643479607a5f3a192d95d0 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 18:54:39 +0200 Subject: [PATCH 172/648] added missing import --- lapis/utility/monitor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index abdbf7d..fe2b738 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -2,6 +2,7 @@ from typing import Callable, TYPE_CHECKING import logging +import logging.handlers from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter From b621aadea230134a7a4de3f818100de2344c01de Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 19:05:04 +0200 Subject: [PATCH 173/648] logging of new user demand now integrated in logging of user demand in general --- lapis/job.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 91c2084..62241a5 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -6,12 +6,13 @@ from usim import TaskCancelled -def job_demand(simulator): +async def job_demand(simulator): """ function randomly sets global user demand by using different strategies :param env: :return: """ + from lapis.utility.monitor import sampling_required while True: delay = random.randint(0, 100) strategy = random.random() @@ -31,7 +32,7 @@ def job_demand(simulator): value = round(value) if value > 0: simulator.global_demand.put(value) - logging.info("user_demand", {"user_demand_new": 
value}) + await sampling_required.set(True) class Job(object): @@ -117,6 +118,7 @@ def __repr__(self): async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): + from lapis.utility.monitor import sampling_required job = next(job_generator) base_date = job.queue_date current_time = 0 @@ -133,6 +135,6 @@ async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): job = None else: if count > 0: - logging.info("user_demand", {"user_demand_new": count}) + await sampling_required.set(True) count = 0 await (time == current_time) From a25c198953f978505b87d390befa091f2f2adc16 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 19:13:09 +0200 Subject: [PATCH 174/648] made flake8 happy again --- lapis/simulator.py | 4 +--- lapis/utility/monitor.py | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index a1827cd..d57fc21 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -7,8 +7,7 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.utility.monitor import Monitoring, collect_pool_cobald_statistics, \ - collect_user_demand, collect_job_statistics, collect_drone_cobald_statistics, \ - collect_cobald_cost + collect_user_demand, collect_job_statistics, collect_drone_cobald_statistics class Simulator(object): @@ -30,7 +29,6 @@ def enable_monitoring(self): self.monitoring.register_statistic(collect_user_demand) self.monitoring.register_statistic(collect_job_statistics) self.monitoring.register_statistic(collect_drone_cobald_statistics) - # self.monitoring.register_statistic(collect_cobald_cost) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index fe2b738..44ea50b 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -76,8 +76,9 @@ def collect_resource_statistics(simulator: "Simulator") -> list: "pool_configuration": None, "pool_type": "drone", "pool": repr(drone), - "used_ratio": drone.used_resources.get(resource_type, 0) / - drone.resources.get(resource_type, 0) + "used_ratio": + drone.used_resources.get(resource_type, 0) + / drone.resources.get(resource_type, 0) }) return results From df99f80a54ae90f3162bd68c14181a3ea74ff874 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 19:45:42 +0200 Subject: [PATCH 175/648] removed unused function for generating random number of user demands --- lapis/job.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 62241a5..f6aa47d 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -1,40 +1,9 @@ -import random -import math import logging from usim import time from usim import TaskCancelled -async def job_demand(simulator): - """ - function randomly sets global user demand by using different strategies - :param env: - :return: - """ - from lapis.utility.monitor import sampling_required - while True: - delay = random.randint(0, 100) - strategy = random.random() - if strategy < 1 / 3: - # linear amount - # print("strategy: linear amount") - amount = random.randint(0, int(random.random() * 100)) - elif strategy < 2 / 3: - # exponential amount - # print("strategy: exponential amount") - amount = (math.e ** (random.random()) - 1) * random.random() * 1000 - else: - # sqrt - # print("strategy: sqrt amount") - amount = math.sqrt(random.random() * random.random() * 100) - value = yield simulator.env.timeout(delay=delay, 
value=amount) - value = round(value) - if value > 0: - simulator.global_demand.put(value) - await sampling_required.set(True) - - class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", "queue_date", "in_queue_since", "in_queue_until", "_name", "_success") From 65891abcce39c72f34b7714e07685faf6f835d54 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Jun 2019 19:46:16 +0200 Subject: [PATCH 176/648] added stub functions for logging of information for pool status and configuration information --- lapis/utility/monitor.py | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 44ea50b..2f28a34 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -193,3 +193,46 @@ def collect_pool_cobald_statistics(simulator: "Simulator") -> list: } collect_pool_cobald_statistics.name = "cobald_status" + +def collect_pool_status(simulator: "Simulator") -> list: + """ + Function takes care on logging information about when pools and drones + did change state within the system, e.g. were integrated or removed. + + :param simulator: the simulator + :return: list of records for logging + """ + pass + + +collect_pool_status.name = "pool_status" +collect_pool_status.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + tags={"tardis", "parent_pool", "pool_configuration", "pool_type"}, + resolution=1 + ) +} + + +def collect_configuration_information(simulator: "Simulator") -> list: + """ + Function takes care on logging information about the configuration of + pools and drones, e.g. provided resources. 
+ + :param simulator: the simulator + :return: list of records for logging + """ + pass + + +collect_configuration_information.name = "configuration" +collect_configuration_information.logging_formatter = { + LoggingSocketHandler.__class__.__name__: JsonFormatter(), + logging.StreamHandler.__class__.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "resource_type"}, + resolution=1 + ) +} From 992ff4f434e65de837f6ac7ea72e871adc610330 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 28 Jun 2019 10:57:38 +0200 Subject: [PATCH 177/648] removed definition to log costs --- lapis/utility/monitor.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 2f28a34..e58a862 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -8,8 +8,6 @@ from cobald.monitor.format_line import LineProtocolFormatter from usim import each, Flag, time -from lapis.cost import cobald_cost - if TYPE_CHECKING: from lapis.simulator import Simulator @@ -94,17 +92,6 @@ def collect_resource_statistics(simulator: "Simulator") -> list: collect_resource_statistics.name = "resource_status" -def collect_cobald_cost(simulator: "Simulator") -> dict: - current_cost = cobald_cost(simulator) - simulator.cost += current_cost - return { - "cobald_cost": { - "current": current_cost, - "accumulated": simulator.cost - } - } - - def collect_user_demand(simulator: "Simulator") -> list: return [{ "value": len(simulator.job_scheduler.job_queue) From c0c77bc852fbf3415d57dddf7a210fb5a5aaf895 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 1 Jul 2019 13:40:02 +0200 Subject: [PATCH 178/648] fixed propagation to root logger not to happen for monitoring of simulation --- lapis/utility/monitor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index e58a862..8880ee5 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -57,12 +57,15 @@ def register_statistic(self, statistic: Callable): # prepare the logger logger = logging.getLogger(statistic.name) if len(logger.handlers) == 0: + logger.addFilter(TimeFilter()) + logger.propagate = False # append handlers of default logger and add required formatters root_logger = logging.getLogger() for handler in root_logger.handlers: new_handler = copy.copy(handler) new_handler.setFormatter(statistic.logging_formatter.get( type(handler).__name__, JsonFormatter())) + logger.addHandler(new_handler) def collect_resource_statistics(simulator: "Simulator") -> list: From 721f52480b8549730f1fa054e8847900309fbd30 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 1 Jul 2019 13:41:57 +0200 Subject: [PATCH 179/648] changed pool and drone uuid to be served as tag --- lapis/utility/monitor.py | 76 ++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 8880ee5..96ceb52 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -74,7 +74,7 @@ def collect_resource_statistics(simulator: "Simulator") -> list: for resource_type in {*drone.resources, *drone.used_resources}: results.append({ "resource_type": resource_type, - "pool_configuration": None, + "pool_configuration": "None", "pool_type": "drone", "pool": repr(drone), "used_ratio": @@ -84,15 +84,15 @@ def collect_resource_statistics(simulator: "Simulator") -> list: return results 
+collect_resource_statistics.name = "resource_status" collect_resource_statistics.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( - tags={"tardis", "resource_type", "pool_configuration", "pool_type"}, + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, resolution=1 ) } -collect_resource_statistics.name = "resource_status" def collect_user_demand(simulator: "Simulator") -> list: @@ -101,14 +101,15 @@ def collect_user_demand(simulator: "Simulator") -> list: }] +collect_user_demand.name = "user_demand" collect_user_demand.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 ) } -collect_user_demand.name = "user_demand" def collect_job_statistics(simulator: "Simulator") -> list: @@ -120,22 +121,22 @@ def collect_job_statistics(simulator: "Simulator") -> list: }] +collect_job_statistics.name = "cobald_status" collect_job_statistics.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type"}, + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 ) } -collect_job_statistics.name = "cobald_status" def collect_drone_cobald_statistics(simulator: "Simulator") -> list: results = [] for drone in simulator.job_scheduler.drone_list: results.append({ - "pool_configuration": None, + "pool_configuration": "None", "pool_type": "drone", "pool": repr(drone), "allocation": drone.allocation, @@ -147,22 +148,29 @@ def collect_drone_cobald_statistics(simulator: "Simulator") -> list: return results +collect_drone_cobald_statistics.name = "cobald_status" collect_drone_cobald_statistics.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( - tags={"tardis", "resource_type", "pool_configuration", "pool_type"}, + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 ) } -collect_drone_cobald_statistics.name = "cobald_status" def collect_pool_cobald_statistics(simulator: "Simulator") -> list: + """ + Function takes care on collecting statistics of pools, i.e. allocation, + utilisation, demand, and supply. 
+ + :param simulator: the simulator + :return: list of records to log + """ results = [] for pool in simulator.pools: results.append({ - "pool_configuration": None, + "pool_configuration": "None", "pool_type": "pool", "pool": repr(pool), "allocation": pool.allocation, @@ -173,15 +181,15 @@ def collect_pool_cobald_statistics(simulator: "Simulator") -> list: return results +collect_pool_cobald_statistics.name = "cobald_status" collect_pool_cobald_statistics.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type"}, + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 ) } -collect_pool_cobald_statistics.name = "cobald_status" def collect_pool_status(simulator: "Simulator") -> list: @@ -197,10 +205,10 @@ def collect_pool_status(simulator: "Simulator") -> list: collect_pool_status.name = "pool_status" collect_pool_status.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( - tags={"tardis", "parent_pool", "pool_configuration", "pool_type"}, + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, resolution=1 ) } @@ -219,9 +227,9 @@ def collect_configuration_information(simulator: "Simulator") -> list: collect_configuration_information.name = "configuration" collect_configuration_information.logging_formatter = { - LoggingSocketHandler.__class__.__name__: JsonFormatter(), - logging.StreamHandler.__class__.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__class__.__name__: LineProtocolFormatter( + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 ) From b948c962d48253a3e1de93240a9d5d73602e44ff Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 12:47:57 +0200 Subject: [PATCH 180/648] fixed calculation when logging resource status --- lapis/utility/monitor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 96ceb52..726b2f2 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -79,7 +79,10 @@ def collect_resource_statistics(simulator: "Simulator") -> list: "pool": repr(drone), "used_ratio": drone.used_resources.get(resource_type, 0) - / drone.resources.get(resource_type, 0) + / drone.pool_resources.get(resource_type, 0), + "requested_ratio": + drone.resources.get(resource_type, 0) + / drone.pool_resources.get(resource_type, 0) }) return results From 59c32fa63ddd16d333bcb29fc095be46ba14fca2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 12:48:28 +0200 Subject: [PATCH 181/648] included all functions for collecting information --- lapis/simulator.py | 8 ++++++-- lapis/utility/monitor.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lapis/simulator.py 
b/lapis/simulator.py index d57fc21..8e2d461 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -7,7 +7,8 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.utility.monitor import Monitoring, collect_pool_cobald_statistics, \ - collect_user_demand, collect_job_statistics, collect_drone_cobald_statistics + collect_user_demand, collect_job_statistics, collect_drone_cobald_statistics, \ + collect_resource_statistics, collect_pool_status, collect_configuration_information class Simulator(object): @@ -25,10 +26,13 @@ def __init__(self, seed=1234): def enable_monitoring(self): self.monitoring = Monitoring(self) - self.monitoring.register_statistic(collect_pool_cobald_statistics) self.monitoring.register_statistic(collect_user_demand) self.monitoring.register_statistic(collect_job_statistics) + self.monitoring.register_statistic(collect_pool_cobald_statistics) self.monitoring.register_statistic(collect_drone_cobald_statistics) + self.monitoring.register_statistic(collect_resource_statistics) + self.monitoring.register_statistic(collect_pool_status) + self.monitoring.register_statistic(collect_configuration_information) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 726b2f2..2324c0d 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -203,7 +203,7 @@ def collect_pool_status(simulator: "Simulator") -> list: :param simulator: the simulator :return: list of records for logging """ - pass + return [] collect_pool_status.name = "pool_status" @@ -225,7 +225,7 @@ def collect_configuration_information(simulator: "Simulator") -> list: :param simulator: the simulator :return: list of records for logging """ - pass + return [] collect_configuration_information.name = "configuration" From 31da270e9b1fdb456bba016bc8fe002a001269c4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 15:22:33 +0200 Subject: [PATCH 182/648] moved usim utilities to __init__ of testing module --- lapis_tests/__init__.py | 48 +++++++++++++++++++++++++++++++++++++++++ lapis_tests/test_job.py | 2 +- lapis_tests/utility.py | 48 ----------------------------------------- 3 files changed, 49 insertions(+), 49 deletions(-) delete mode 100644 lapis_tests/utility.py diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index e69de29..630289d 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -0,0 +1,48 @@ +from typing import Callable, Coroutine +from functools import wraps + +from usim import run +from usim._core.loop import ActivityError + +from lapis.drone import Drone + + +def via_usim(test_case: Callable[..., Coroutine]): + """ + Mark an ``async def`` test case to be run via ``usim.run`` + + .. code:: python3 + + @via_usim + async def test_sleep(): + before = time.now + await (time + 20) + after = time.now + assert after - before == 20 + """ + @wraps(test_case) + def run_test(*args, **kwargs): + # pytest currently ignores __tracebackhide__ if we re-raise + # https://github.com/pytest-dev/pytest/issues/1904 + __tracebackhide__ = True + # >>> This is not the frame you are looking for. Do read on. 
<<< + try: + return run(test_case(*args, **kwargs)) + except ActivityError as err: + # unwrap any exceptions + raise err.__cause__ + return run_test + + +class DummyScheduler(): + @staticmethod + def register_drone(drone: Drone): + pass + + @staticmethod + def unregister_drone(drone: Drone): + pass + + @staticmethod + def update_drone(drone: Drone): + pass diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 07cb554..12cc65f 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -3,7 +3,7 @@ from lapis.drone import Drone from lapis.job import Job -from lapis_tests.utility import via_usim, DummyScheduler +from lapis_tests import via_usim, DummyScheduler class TestJob(object): diff --git a/lapis_tests/utility.py b/lapis_tests/utility.py deleted file mode 100644 index 630289d..0000000 --- a/lapis_tests/utility.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Callable, Coroutine -from functools import wraps - -from usim import run -from usim._core.loop import ActivityError - -from lapis.drone import Drone - - -def via_usim(test_case: Callable[..., Coroutine]): - """ - Mark an ``async def`` test case to be run via ``usim.run`` - - .. code:: python3 - - @via_usim - async def test_sleep(): - before = time.now - await (time + 20) - after = time.now - assert after - before == 20 - """ - @wraps(test_case) - def run_test(*args, **kwargs): - # pytest currently ignores __tracebackhide__ if we re-raise - # https://github.com/pytest-dev/pytest/issues/1904 - __tracebackhide__ = True - # >>> This is not the frame you are looking for. Do read on. <<< - try: - return run(test_case(*args, **kwargs)) - except ActivityError as err: - # unwrap any exceptions - raise err.__cause__ - return run_test - - -class DummyScheduler(): - @staticmethod - def register_drone(drone: Drone): - pass - - @staticmethod - def unregister_drone(drone: Drone): - pass - - @staticmethod - def update_drone(drone: Drone): - pass From ef158ff2fb1451c2ed6a06aea05dc744b7ad93d9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 15:23:18 +0200 Subject: [PATCH 183/648] added test for filtering of creation time of logs --- lapis_tests/utility/__init__.py | 32 ++++++++++++++++++++ lapis_tests/utility/test_monitor.py | 45 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 lapis_tests/utility/__init__.py create mode 100644 lapis_tests/utility/test_monitor.py diff --git a/lapis_tests/utility/__init__.py b/lapis_tests/utility/__init__.py new file mode 100644 index 0000000..a90cf2b --- /dev/null +++ b/lapis_tests/utility/__init__.py @@ -0,0 +1,32 @@ +import io +import logging +import threading + + +class CapturingHandler(logging.StreamHandler): + @property + def content(self) -> str: + return self.stream.getvalue() + + def __init__(self): + super().__init__(stream=io.StringIO()) + + def clear(self): + self.stream.truncate(0) + self.stream.seek(0) + + +_test_index = 0 +_index_lock = threading.Lock() + + +def make_test_logger(base_name: str = 'test_logger'): + with _index_lock: + global _test_index + log_name = base_name + '.test%d' % _test_index + _test_index += 1 + logger = logging.getLogger(log_name) + logger.propagate = False + handler = CapturingHandler() + logger.handlers = [handler] + return logger, handler diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py new file mode 100644 index 0000000..86ec184 --- /dev/null +++ b/lapis_tests/utility/test_monitor.py @@ -0,0 +1,45 @@ +import ast + +from cobald.monitor.format_line import 
LineProtocolFormatter +from usim import Scope, time, until, eternity + +from lapis_tests import via_usim + +from . import make_test_logger + +from lapis.utility.monitor import TimeFilter + + +def parse_line_protocol(literal: str): + name_tags, _, fields_stamp = literal.strip().partition(' ') + fields, _, stamp = fields_stamp.partition(' ') + fields = fields.split(',') if fields else [] + name, *tags = name_tags.split(',') + return name, { + key: value + for key, value + in (tag.split('=') for tag in tags) + }, { + key: ast.literal_eval(value) + for key, value + in (field.split('=') for field in fields) + }, None if not stamp else int(stamp) + + +class TestTimeFilter(object): + @via_usim + async def test_simple(self): + payload = {"a": "a"} + logger, handler = make_test_logger(__name__) + handler.formatter = LineProtocolFormatter(resolution=1) + logger.addFilter(TimeFilter()) + async with Scope() as _: + logger.critical("message", payload) + _, _, _, timestamp = parse_line_protocol(handler.content) + handler.clear() + assert timestamp == 0 + async with until(time == 10): + await eternity + logger.critical("message", payload) + _, _, _, timestamp = parse_line_protocol(handler.content) + assert timestamp == 10000000000 From 1005f00517bf7b7cf03efed610a45e61bb08b559 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 15:57:53 +0200 Subject: [PATCH 184/648] added test for monitoring class --- lapis_tests/utility/test_monitor.py | 37 +++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 86ec184..63ae0f7 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -1,13 +1,15 @@ import ast +import pytest +from time import time as pytime from cobald.monitor.format_line import LineProtocolFormatter from usim import Scope, time, until, eternity -from lapis_tests import via_usim +from lapis_tests import via_usim, DummyScheduler from . 
import make_test_logger -from lapis.utility.monitor import TimeFilter +from lapis.utility.monitor import TimeFilter, Monitoring, collect_resource_statistics def parse_line_protocol(literal: str): @@ -43,3 +45,34 @@ async def test_simple(self): logger.critical("message", payload) _, _, _, timestamp = parse_line_protocol(handler.content) assert timestamp == 10000000000 + + @via_usim + async def test_explicit(self): + def record(): + pass + record.created = pytime() + filter = TimeFilter() + async with Scope() as _: + filter.filter(record) + assert record.created == 0 + + +def dummy_statistics(): + return [] + + +class TestMonitoring(object): + def test_registration(self): + scheduler = DummyScheduler() + monitoring = Monitoring(scheduler) + statistics = collect_resource_statistics + monitoring.register_statistic(statistics) + assert statistics in monitoring._statistics + + def test_registration_failure(self): + scheduler = DummyScheduler() + monitoring = Monitoring(scheduler) + statistics = dummy_statistics + with pytest.raises(AssertionError): + monitoring.register_statistic(statistics) + assert statistics not in monitoring._statistics From ce1cb18d0d27edee493f7a0585f72f55d5a2849a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 17:19:03 +0200 Subject: [PATCH 185/648] Apply suggestions from code review Co-Authored-By: Max Fischer --- lapis/utility/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py index 2324c0d..c346834 100644 --- a/lapis/utility/monitor.py +++ b/lapis/utility/monitor.py @@ -56,7 +56,7 @@ def register_statistic(self, statistic: Callable): # prepare the logger logger = logging.getLogger(statistic.name) - if len(logger.handlers) == 0: + if not logger.handlers: logger.addFilter(TimeFilter()) logger.propagate = False # append handlers of default logger and add required formatters From 3e157b713b38019c3da1f54535c49bab0a3008db Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 17:47:54 +0200 Subject: [PATCH 186/648] changes requested from review for pr --- lapis/cli/simulate.py | 3 +- lapis/drone.py | 6 +- lapis/job.py | 3 +- lapis/monitor/__init__.py | 66 ++++++++ lapis/monitor/cobald.py | 75 +++++++++ lapis/monitor/general.py | 138 ++++++++++++++++ lapis/simulator.py | 21 +-- lapis/utility/__init__.py | 0 lapis/utility/monitor.py | 239 ---------------------------- lapis_tests/utility/test_monitor.py | 13 +- 10 files changed, 302 insertions(+), 262 deletions(-) create mode 100644 lapis/monitor/__init__.py create mode 100644 lapis/monitor/cobald.py create mode 100644 lapis/monitor/general.py delete mode 100644 lapis/utility/__init__.py delete mode 100644 lapis/utility/monitor.py diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 020b3a5..a8d3af8 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -13,8 +13,7 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator -from lapis.utility.monitor import TimeFilter, LoggingSocketHandler, \ - LoggingUDPSocketHandler +from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler, TimeFilter last_step = 0 diff --git a/lapis/drone.py b/lapis/drone.py index f8ab250..f283652 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -53,7 +53,7 @@ def theoretical_available_resources(self): } async def run(self): - from lapis.utility.monitor import sampling_required + from lapis.monitor import sampling_required await (time + self.scheduling_duration) self._supply = 1 
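        # ``sampling_required`` is the usim ``Flag`` defined in ``lapis.monitor``:
        # components set it on state changes, while ``Monitoring.run`` awaits it,
        # clears it and emits one round of records. The import stays local to the
        # coroutine, presumably to avoid a circular import between ``lapis.drone``
        # and the monitoring package at module load time.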
self.scheduler.register_drone(self) @@ -92,7 +92,7 @@ def _init_allocation_and_utilisation(self): self._utilisation = min(resources) async def shutdown(self): - from lapis.utility.monitor import sampling_required + from lapis.monitor import sampling_required self._supply = 0 self.scheduler.unregister_drone(self) await sampling_required.set(True) @@ -136,7 +136,7 @@ async def start_job(self, job: Job, kill: bool = False): # TODO: ensure that jobs cannot immediately started on the same drone # until the jobs did not allocate resources async with Scope() as scope: - from lapis.utility.monitor import sampling_required + from lapis.monitor import sampling_required self._utilisation = self._allocation = None job_execution = scope.do(job.run()) diff --git a/lapis/job.py b/lapis/job.py index f6aa47d..d44b82b 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -3,6 +3,8 @@ from usim import time from usim import TaskCancelled +from lapis.monitor import sampling_required + class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", @@ -87,7 +89,6 @@ def __repr__(self): async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): - from lapis.utility.monitor import sampling_required job = next(job_generator) base_date = job.queue_date current_time = 0 diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py new file mode 100644 index 0000000..7d5cda4 --- /dev/null +++ b/lapis/monitor/__init__.py @@ -0,0 +1,66 @@ +import copy +import logging +import logging.handlers +from typing import Callable, TYPE_CHECKING + +from cobald.monitor.format_json import JsonFormatter +from usim import time, Flag, each + +if TYPE_CHECKING: + from lapis.simulator import Simulator + + +class LoggingSocketHandler(logging.handlers.SocketHandler): + def makePickle(self, record): + return self.format(record).encode() + + +class LoggingUDPSocketHandler(logging.handlers.DatagramHandler): + def makePickle(self, record): + return self.format(record).encode() + + +class TimeFilter(logging.Filter): + """ + Dummy filter to replace log record timestamp with simulation time. 
+ """ + def filter(self, record) -> bool: + record.created = time.now + return True + + +sampling_required = Flag() + + +class Monitoring(object): + def __init__(self, simulator: "Simulator"): + self.simulator = simulator + self._statistics = [] + + async def run(self): + async for _ in each(delay=1): + await sampling_required + await sampling_required.set(False) + for statistic in self._statistics: + # do the logging + for record in statistic(self.simulator): + logging.getLogger(statistic.name).info( + statistic.name, record + ) + + def register_statistic(self, statistic: Callable): + assert hasattr(statistic, "name") and hasattr(statistic, "logging_formatter") + self._statistics.append(statistic) + + # prepare the logger + logger = logging.getLogger(statistic.name) + if len(logger.handlers) == 0: + logger.addFilter(TimeFilter()) + logger.propagate = False + # append handlers of default logger and add required formatters + root_logger = logging.getLogger() + for handler in root_logger.handlers: + new_handler = copy.copy(handler) + new_handler.setFormatter(statistic.logging_formatter.get( + type(handler).__name__, JsonFormatter())) + logger.addHandler(new_handler) diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py new file mode 100644 index 0000000..3c6c879 --- /dev/null +++ b/lapis/monitor/cobald.py @@ -0,0 +1,75 @@ +import logging + +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter +from typing import TYPE_CHECKING + +from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler + +if TYPE_CHECKING: + from lapis.simulator import Simulator + + +def drone_statistics(simulator: "Simulator") -> list: + """ + Collect allocation, utilisation, demand and supply of drones. + + :param simulator: the simulator + :return: list of records for logging + """ + results = [] + for drone in simulator.job_scheduler.drone_list: + results.append({ + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "allocation": drone.allocation, + "utilisation": drone.utilisation, + "demand": drone.demand, + "supply": drone.supply, + "job_count": drone.jobs + }) + return results + + +drone_statistics.name = "cobald_status" +drone_statistics.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, + resolution=1 + ) +} + + +def pool_statistics(simulator: "Simulator") -> list: + """ + Collect allocation, utilisation, demand and supply of pools. 
+ + :param simulator: the simulator + :return: list of records to log + """ + results = [] + for pool in simulator.pools: + results.append({ + "pool_configuration": "None", + "pool_type": "pool", + "pool": repr(pool), + "allocation": pool.allocation, + "utilisation": pool.utilisation, + "demand": pool.demand, + "supply": pool.supply, + }) + return results + + +pool_statistics.name = "cobald_status" +pool_statistics.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, + resolution=1 + ) +} diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py new file mode 100644 index 0000000..7d49981 --- /dev/null +++ b/lapis/monitor/general.py @@ -0,0 +1,138 @@ +from typing import TYPE_CHECKING + +import logging.handlers + +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter + +from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler + +if TYPE_CHECKING: + from lapis.simulator import Simulator + + +def resource_statistics(simulator: "Simulator") -> list: + """ + Log ratio of used and requested resources for drones. + + :param simulator: the simulator + :return: list of records for logging + """ + results = [] + for drone in simulator.job_scheduler.drone_list: + for resource_type in {*drone.resources, *drone.used_resources}: + results.append({ + "resource_type": resource_type, + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "used_ratio": + drone.used_resources.get(resource_type, 0) + / drone.pool_resources.get(resource_type, 0), + "requested_ratio": + drone.resources.get(resource_type, 0) + / drone.pool_resources.get(resource_type, 0) + }) + return results + + +resource_statistics.name = "resource_status" +resource_statistics.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, + resolution=1 + ) +} + + +def user_demand(simulator: "Simulator") -> list: + """ + Log global user demand. + + :param simulator: the simulator + :return: list of records for logging + """ + return [{ + "value": len(simulator.job_scheduler.job_queue) + }] + + +user_demand.name = "user_demand" +user_demand.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, + resolution=1 + ) +} + + +def job_statistics(simulator: "Simulator") -> list: + """ + Log number of jobs running in a drone. + + :param simulator: the simulator + :return: list of records for logging + """ + result = 0 + for drone in simulator.job_scheduler.drone_list: + result += drone.jobs + return [{ + "job_count": result + }] + + +job_statistics.name = "cobald_status" +job_statistics.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, + resolution=1 + ) +} + + +def pool_status(simulator: "Simulator") -> list: + """ + Log state changes of pools and drones. 
+ + :param simulator: the simulator + :return: list of records for logging + """ + return [] + + +pool_status.name = "pool_status" +pool_status.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, + resolution=1 + ) +} + + +def configuration_information(simulator: "Simulator") -> list: + """ + Log information how pools and drones are configured, e.g. provided resources. + + :param simulator: the simulator + :return: list of records for logging + """ + return [] + + +configuration_information.name = "configuration" +configuration_information.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "resource_type"}, + resolution=1 + ) +} diff --git a/lapis/simulator.py b/lapis/simulator.py index 8e2d461..06780c2 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,9 +6,10 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from lapis.utility.monitor import Monitoring, collect_pool_cobald_statistics, \ - collect_user_demand, collect_job_statistics, collect_drone_cobald_statistics, \ - collect_resource_statistics, collect_pool_status, collect_configuration_information +from lapis.monitor.general import user_demand, job_statistics, \ + resource_statistics, pool_status, configuration_information +from lapis.monitor import Monitoring +from lapis.monitor.cobald import drone_statistics, pool_statistics class Simulator(object): @@ -26,13 +27,13 @@ def __init__(self, seed=1234): def enable_monitoring(self): self.monitoring = Monitoring(self) - self.monitoring.register_statistic(collect_user_demand) - self.monitoring.register_statistic(collect_job_statistics) - self.monitoring.register_statistic(collect_pool_cobald_statistics) - self.monitoring.register_statistic(collect_drone_cobald_statistics) - self.monitoring.register_statistic(collect_resource_statistics) - self.monitoring.register_statistic(collect_pool_status) - self.monitoring.register_statistic(collect_configuration_information) + self.monitoring.register_statistic(user_demand) + self.monitoring.register_statistic(job_statistics) + self.monitoring.register_statistic(pool_statistics) + self.monitoring.register_statistic(drone_statistics) + self.monitoring.register_statistic(resource_statistics) + self.monitoring.register_statistic(pool_status) + self.monitoring.register_statistic(configuration_information) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/lapis/utility/__init__.py b/lapis/utility/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lapis/utility/monitor.py b/lapis/utility/monitor.py deleted file mode 100644 index 2324c0d..0000000 --- a/lapis/utility/monitor.py +++ /dev/null @@ -1,239 +0,0 @@ -import copy -from typing import Callable, TYPE_CHECKING - -import logging -import logging.handlers - -from cobald.monitor.format_json import JsonFormatter -from cobald.monitor.format_line import LineProtocolFormatter -from usim import each, Flag, time - -if TYPE_CHECKING: - from lapis.simulator import Simulator - -sampling_required = Flag() - - -class LoggingSocketHandler(logging.handlers.SocketHandler): - def makePickle(self, record): - return 
self.format(record).encode() - - -class LoggingUDPSocketHandler(logging.handlers.DatagramHandler): - def makePickle(self, record): - return self.format(record).encode() - - -class TimeFilter(logging.Filter): - """ - py:class:`TimeFilter` takes care to modify the created timestamp of a log - record to be set to the current simulation time. - """ - def filter(self, record) -> bool: - record.created = time.now - return True - - -class Monitoring(object): - def __init__(self, simulator: "Simulator"): - self.simulator = simulator - self._statistics = [] - - async def run(self): - async for _ in each(delay=1): - await sampling_required - await sampling_required.set(False) - for statistic in self._statistics: - # do the logging - for record in statistic(self.simulator): - logging.getLogger(statistic.name).info( - statistic.name, record - ) - - def register_statistic(self, statistic: Callable): - assert hasattr(statistic, "name") and hasattr(statistic, "logging_formatter") - self._statistics.append(statistic) - - # prepare the logger - logger = logging.getLogger(statistic.name) - if len(logger.handlers) == 0: - logger.addFilter(TimeFilter()) - logger.propagate = False - # append handlers of default logger and add required formatters - root_logger = logging.getLogger() - for handler in root_logger.handlers: - new_handler = copy.copy(handler) - new_handler.setFormatter(statistic.logging_formatter.get( - type(handler).__name__, JsonFormatter())) - logger.addHandler(new_handler) - - -def collect_resource_statistics(simulator: "Simulator") -> list: - results = [] - for drone in simulator.job_scheduler.drone_list: - for resource_type in {*drone.resources, *drone.used_resources}: - results.append({ - "resource_type": resource_type, - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "used_ratio": - drone.used_resources.get(resource_type, 0) - / drone.pool_resources.get(resource_type, 0), - "requested_ratio": - drone.resources.get(resource_type, 0) - / drone.pool_resources.get(resource_type, 0) - }) - return results - - -collect_resource_statistics.name = "resource_status" -collect_resource_statistics.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) -} - - -def collect_user_demand(simulator: "Simulator") -> list: - return [{ - "value": len(simulator.job_scheduler.job_queue) - }] - - -collect_user_demand.name = "user_demand" -collect_user_demand.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, - resolution=1 - ) -} - - -def collect_job_statistics(simulator: "Simulator") -> list: - result = 0 - for drone in simulator.job_scheduler.drone_list: - result += drone.jobs - return [{ - "job_count": result - }] - - -collect_job_statistics.name = "cobald_status" -collect_job_statistics.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) -} - - -def collect_drone_cobald_statistics(simulator: "Simulator") -> list: - results = [] - for drone in simulator.job_scheduler.drone_list: - results.append({ - 
"pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "allocation": drone.allocation, - "utilisation": drone.utilisation, - "demand": drone.demand, - "supply": drone.supply, - "job_count": drone.jobs - }) - return results - - -collect_drone_cobald_statistics.name = "cobald_status" -collect_drone_cobald_statistics.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) -} - - -def collect_pool_cobald_statistics(simulator: "Simulator") -> list: - """ - Function takes care on collecting statistics of pools, i.e. allocation, - utilisation, demand, and supply. - - :param simulator: the simulator - :return: list of records to log - """ - results = [] - for pool in simulator.pools: - results.append({ - "pool_configuration": "None", - "pool_type": "pool", - "pool": repr(pool), - "allocation": pool.allocation, - "utilisation": pool.utilisation, - "demand": pool.demand, - "supply": pool.supply, - }) - return results - - -collect_pool_cobald_statistics.name = "cobald_status" -collect_pool_cobald_statistics.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) -} - - -def collect_pool_status(simulator: "Simulator") -> list: - """ - Function takes care on logging information about when pools and drones - did change state within the system, e.g. were integrated or removed. - - :param simulator: the simulator - :return: list of records for logging - """ - return [] - - -collect_pool_status.name = "pool_status" -collect_pool_status.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) -} - - -def collect_configuration_information(simulator: "Simulator") -> list: - """ - Function takes care on logging information about the configuration of - pools and drones, e.g. provided resources. - - :param simulator: the simulator - :return: list of records for logging - """ - return [] - - -collect_configuration_information.name = "configuration" -collect_configuration_information.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "resource_type"}, - resolution=1 - ) -} diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 63ae0f7..6355c44 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -3,13 +3,14 @@ from time import time as pytime from cobald.monitor.format_line import LineProtocolFormatter -from usim import Scope, time, until, eternity +from usim import Scope, time from lapis_tests import via_usim, DummyScheduler from . 
import make_test_logger -from lapis.utility.monitor import TimeFilter, Monitoring, collect_resource_statistics +from lapis.monitor.general import resource_statistics +from lapis.monitor import TimeFilter, Monitoring def parse_line_protocol(literal: str): @@ -35,13 +36,11 @@ async def test_simple(self): logger, handler = make_test_logger(__name__) handler.formatter = LineProtocolFormatter(resolution=1) logger.addFilter(TimeFilter()) - async with Scope() as _: - logger.critical("message", payload) + logger.critical("message", payload) _, _, _, timestamp = parse_line_protocol(handler.content) handler.clear() assert timestamp == 0 - async with until(time == 10): - await eternity + await (time + 10) logger.critical("message", payload) _, _, _, timestamp = parse_line_protocol(handler.content) assert timestamp == 10000000000 @@ -65,7 +64,7 @@ class TestMonitoring(object): def test_registration(self): scheduler = DummyScheduler() monitoring = Monitoring(scheduler) - statistics = collect_resource_statistics + statistics = resource_statistics monitoring.register_statistic(statistics) assert statistics in monitoring._statistics From 6a11991f5416388f7c6e714064423496eff7ad2d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 2 Jul 2019 17:57:46 +0200 Subject: [PATCH 187/648] renamed class TimeFilter to SimulationTimeFilter --- lapis/cli/simulate.py | 5 +++-- lapis/monitor/__init__.py | 4 ++-- lapis_tests/utility/test_monitor.py | 8 ++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index a8d3af8..2c954ad 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -13,7 +13,8 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator -from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler, TimeFilter +from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler, \ + SimulationTimeFilter last_step = 0 @@ -40,7 +41,7 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): ctx.obj['until'] = until monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) - time_filter = TimeFilter() + time_filter = SimulationTimeFilter() monitoring_logger.addFilter(time_filter) if log_tcp: socketHandler = LoggingSocketHandler( diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 5e8f1ca..6231428 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -20,7 +20,7 @@ def makePickle(self, record): return self.format(record).encode() -class TimeFilter(logging.Filter): +class SimulationTimeFilter(logging.Filter): """ Dummy filter to replace log record timestamp with simulation time. """ @@ -55,7 +55,7 @@ def register_statistic(self, statistic: Callable): # prepare the logger logger = logging.getLogger(statistic.name) if not logger.handlers: - logger.addFilter(TimeFilter()) + logger.addFilter(SimulationTimeFilter()) logger.propagate = False # append handlers of default logger and add required formatters root_logger = logging.getLogger() diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 6355c44..6d048fa 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -10,7 +10,7 @@ from . 
import make_test_logger from lapis.monitor.general import resource_statistics -from lapis.monitor import TimeFilter, Monitoring +from lapis.monitor import SimulationTimeFilter, Monitoring def parse_line_protocol(literal: str): @@ -29,13 +29,13 @@ def parse_line_protocol(literal: str): }, None if not stamp else int(stamp) -class TestTimeFilter(object): +class TestSimulationTimeFilter(object): @via_usim async def test_simple(self): payload = {"a": "a"} logger, handler = make_test_logger(__name__) handler.formatter = LineProtocolFormatter(resolution=1) - logger.addFilter(TimeFilter()) + logger.addFilter(SimulationTimeFilter()) logger.critical("message", payload) _, _, _, timestamp = parse_line_protocol(handler.content) handler.clear() @@ -50,7 +50,7 @@ async def test_explicit(self): def record(): pass record.created = pytime() - filter = TimeFilter() + filter = SimulationTimeFilter() async with Scope() as _: filter.filter(record) assert record.created == 0 From d496e892f753c887ed26b3abe3c4406b941d2112 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 4 Sep 2019 19:12:27 +0200 Subject: [PATCH 188/648] added usim from pypi to setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0293bb8..9485f2c 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ # dependencies install_requires=[ 'cobald', - 'usim@git+https://github.com/MaineKuehn/usim.git@master#egg=usim-0.1.0', + 'usim', 'click' ], extras_require={ From b14a638adaf0fdc9376d0825f5d3cc907e12b8f8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 4 Sep 2019 20:05:32 +0200 Subject: [PATCH 189/648] adapted python version for testing --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 398bdd2..ba411fd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,9 @@ +dist: xenial language: python python: - - "3.5" - "3.6" + - "3.7" - "pypy3" - - "3.7-dev" os: - linux # - osx # osx+python installation fails From 7ec033eecb1ca0f5e2ed1c0c0b338ffd880616eb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 13:48:34 +0200 Subject: [PATCH 190/648] adapted framework to current features of usim --- lapis/drone.py | 98 ++++++++++++++-------------------------- lapis/job.py | 6 +++ lapis/monitor/general.py | 8 ++-- lapis/scheduler.py | 15 +++--- 4 files changed, 54 insertions(+), 73 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index f283652..99c0cf6 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -2,6 +2,7 @@ from cobald import interfaces from usim import time, Scope, instant, TaskState +from usim.basics import Capacities, ResourcesUnavailable from lapis.job import Job @@ -24,7 +25,9 @@ def __init__(self, scheduler, pool_resources: dict, super(Drone, self).__init__() self.scheduler = scheduler self.pool_resources = pool_resources - self.resources = {resource: 0 for resource in self.pool_resources} + self.resources = Capacities(**pool_resources) + # shadowing requested resources to determine jobs to be killed + self.used_resources = Capacities(**pool_resources) if ignore_resources: self._valid_resource_keys = [ resource for resource in self.pool_resources @@ -32,8 +35,6 @@ def __init__(self, scheduler, pool_resources: dict, ] else: self._valid_resource_keys = self.pool_resources.keys() - # shadowing requested resources to determine jobs to be killed - self.used_resources = {resource: 0 for resource in self.pool_resources} self.scheduling_duration = scheduling_duration if scheduling_duration == 0: 
self._supply = 1 @@ -47,9 +48,16 @@ def __init__(self, scheduler, pool_resources: dict, @property def theoretical_available_resources(self): + levels = self.resources.levels return { - key: self.pool_resources[key] - self.resources[key] - for key in self.pool_resources + key: getattr(levels, key) for key in self.pool_resources + } + + @property + def available_resources(self): + levels = self.used_resources.levels + return { + key: getattr(levels, key) for key in self.pool_resources } async def run(self): @@ -84,10 +92,11 @@ def allocation(self) -> float: return self._allocation def _init_allocation_and_utilisation(self): + levels = self.resources.levels resources = [] for resource_key in self._valid_resource_keys: resources.append( - self.resources[resource_key] / self.pool_resources[resource_key]) + getattr(levels, resource_key) / self.pool_resources[resource_key]) self._allocation = max(resources) self._utilisation = min(resources) @@ -99,29 +108,6 @@ async def shutdown(self): await (time + 1) # print("[drone %s] has been shut down" % self) - def _add_resources(self, keys: list, target: dict, source: dict, - alternative_source: dict): - resources_exceeded = False - for resource_key in keys: - try: - value = target[resource_key] + source[resource_key] - except KeyError: - value = target[resource_key] + alternative_source[resource_key] - if value > self.pool_resources[resource_key]: - resources_exceeded = True - target[resource_key] = value - if resources_exceeded: - raise ResourcesExceeded() - - @staticmethod - def _remove_resources(keys: list, target: dict, source: dict, - alternative_source: dict): - for resource_key in keys: - try: - target[resource_key] -= source[resource_key] - except KeyError: - target[resource_key] -= alternative_source[resource_key] - async def start_job(self, job: Job, kill: bool = False): """ Method manages to start a job in the context of the given drone. 
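# The ``start_job`` rewrite below delegates the bookkeeping to usim's resource
# primitives. A minimal sketch of that pattern, assuming the semantics implied
# by their use in this patch (``claim`` as an async context manager that
# reserves the given amounts and raises ``ResourcesUnavailable`` for requests
# beyond the configured capacity; names and numbers are purely illustrative):
from usim import run
from usim.basics import Capacities, ResourcesUnavailable


async def sketch():
    resources = Capacities(cores=8, memory=32)
    async with resources.claim(cores=2, memory=4):
        print(dict(resources.levels))  # per-resource levels while the claim is held
    try:
        async with resources.claim(cores=16):  # exceeds the configured capacity
            pass
    except ResourcesUnavailable:
        print("claim rejected")


run(sketch())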
@@ -133,45 +119,37 @@ async def start_job(self, job: Job, kill: bool = False): requested resources :return: """ - # TODO: ensure that jobs cannot immediately started on the same drone - # until the jobs did not allocate resources async with Scope() as scope: from lapis.monitor import sampling_required self._utilisation = self._allocation = None job_execution = scope.do(job.run()) await instant # waiting just a moment to enable job to set parameters - job_keys = {*job.resources, *job.used_resources} - try: - self._add_resources( - job_keys, self.used_resources, job.used_resources, job.resources) - except ResourcesExceeded: + async with self.resources.claim(**job.resources), \ + self.used_resources.claim(**job.used_resources): + self.jobs += 1 + await sampling_required.set(True) + for resource_key in job.resources: + try: + if job.resources[resource_key] < \ + job.used_resources[resource_key]: + if kill: + job_execution.cancel() + except KeyError: + # check is not relevant if the data is not stored + pass + self.scheduler.update_drone(self) + await job_execution.done + except ResourcesUnavailable: job_execution.cancel() - try: - # TODO: we should allow for overbooking of resources - self._add_resources( - job_keys, self.resources, job.resources, job.used_resources) - except ResourcesExceeded: + except AssertionError: job_execution.cancel() + else: + self.jobs -= 1 - for resource_key in job_keys: - try: - if job.resources[resource_key] < job.used_resources[resource_key]: - if kill: - job_execution.cancel() - else: - pass - except KeyError: - # check is not relevant if the data is not stored - pass - self.scheduler.update_drone(self) - if job_execution.status != TaskState.CANCELLED: - self.jobs += 1 - await sampling_required.set(True) - await job_execution.done if job_execution.status == TaskState.CANCELLED: - for resource_key in job_keys: + for resource_key in job.resources: usage = job.used_resources.get(resource_key, None) \ or job.resources.get(resource_key, None) value = usage / (job.resources.get(resource_key, None) @@ -182,12 +160,6 @@ async def start_job(self, job: Job, kill: bool = False): repr(job): value } }) - else: - self.jobs -= 1 - self._remove_resources( - job_keys, self.resources, job.resources, job.used_resources) - self._remove_resources( - job_keys, self.used_resources, job.used_resources, job.resources) self._utilisation = self._allocation = None self.scheduler.update_drone(self) await sampling_required.set(True) diff --git a/lapis/job.py b/lapis/job.py index d44b82b..549adf9 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -27,6 +27,12 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float """ self.resources = resources self.used_resources = used_resources + for key in used_resources: + if key not in resources: + logging.getLogger("implementation")\ + .info("job uses different resources than specified, added", + key, self.used_resources[key]) + self.resources[key] = self.used_resources[key] self.walltime = used_resources.pop("walltime", None) self.requested_walltime = resources.pop("walltime", None) assert self.walltime, "Job does not provide any walltime" diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 7d49981..13a417a 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -20,17 +20,19 @@ def resource_statistics(simulator: "Simulator") -> list: """ results = [] for drone in simulator.job_scheduler.drone_list: - for resource_type in {*drone.resources, *drone.used_resources}: + resources = 
drone.theoretical_available_resources + used_resources = drone.available_resources + for resource_type in resources: results.append({ "resource_type": resource_type, "pool_configuration": "None", "pool_type": "drone", "pool": repr(drone), "used_ratio": - drone.used_resources.get(resource_type, 0) + used_resources.get(resource_type, 0) / drone.pool_resources.get(resource_type, 0), "requested_ratio": - drone.resources.get(resource_type, 0) + resources.get(resource_type, 0) / drone.pool_resources.get(resource_type, 0) }) return results diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 2c9e283..ab5bbd0 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -62,8 +62,8 @@ def _add_drone(self, drone: Drone): if len(self.drone_cluster) > 0: for cluster in self.drone_cluster: current_distance = 0 - for key in {*cluster[0].theoretical_available_resources, - *drone.theoretical_available_resources}: + for key in {*cluster[0].pool_resources, + *drone.pool_resources}: current_distance += abs( cluster[0].theoretical_available_resources.get(key, 0) - drone.theoretical_available_resources.get(key, 0)) @@ -101,21 +101,22 @@ def _schedule_job(self, job) -> Drone: for cluster in self.drone_cluster: drone = cluster[0] cost = 0 - resource_types = {*drone.resources.keys(), *job.resources.keys()} + resources = drone.theoretical_available_resources + resource_types = {*resources.keys(), *job.resources.keys()} for resource_type in resource_types: - if resource_type not in drone.resources.keys(): + if resource_type not in drone.pool_resources.keys(): cost = float("Inf") break elif resource_type not in job.resources: cost += drone.pool_resources[resource_type] \ - - drone.resources[resource_type] + - resources.get(resource_type, 0) elif (drone.pool_resources[resource_type] - - drone.resources[resource_type]) < job.resources[resource_type]: + - resources.get(resource_type, 0)) < job.resources[resource_type]: cost = float("Inf") break else: cost += (drone.pool_resources[resource_type] - - drone.resources[resource_type]) // \ + - resources.get(resource_type, 0)) // \ job.resources[resource_type] cost /= len(resource_types) if cost <= 1: From 9330f2b49260224e223c4caf55f1d014bae1988d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 13:49:16 +0200 Subject: [PATCH 191/648] adapted current scheduling with new possibilities --- lapis/scheduler.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index ab5bbd0..10dfcb6 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -102,23 +102,22 @@ def _schedule_job(self, job) -> Drone: drone = cluster[0] cost = 0 resources = drone.theoretical_available_resources - resource_types = {*resources.keys(), *job.resources.keys()} - for resource_type in resource_types: - if resource_type not in drone.pool_resources.keys(): - cost = float("Inf") - break - elif resource_type not in job.resources: - cost += drone.pool_resources[resource_type] \ - - resources.get(resource_type, 0) - elif (drone.pool_resources[resource_type] - - resources.get(resource_type, 0)) < job.resources[resource_type]: + for resource_type in job.resources: + if resources.get(resource_type, 0) < job.resources[resource_type]: + # Inf for all job resources that a drone does not support + # and all resources that are too small to even be considered cost = float("Inf") break else: - cost += (drone.pool_resources[resource_type] - - resources.get(resource_type, 0)) // \ - job.resources[resource_type] - cost 
/= len(resource_types) + try: + cost += 1 / (resources.get(resource_type, 0) // + job.resources[resource_type]) + except ZeroDivisionError: + pass + for additional_resource_type in [key for key in drone.pool_resources + if key not in job.resources]: + cost += resources[additional_resource_type] + cost /= len((*job.resources, *drone.pool_resources)) if cost <= 1: # directly start job return drone From 87aeea711a6c8574b6b50dc540c60acf60b3ca6f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 14:02:46 +0200 Subject: [PATCH 192/648] input parameters of -1 are defaulting to 0 for swf files, fixes #21 --- lapis/job_io/swf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 0b0eb01..4d5e2d6 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -46,6 +46,10 @@ def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 for row in reader: resources = {} used_resources = {} + # correct request parameters + for key in ["cores", "walltime", "memory"]: + if float(row[header[resource_name_mapping[key]]]) < 0: + row[header[resource_name_mapping[key]]] = 0 for key in ["cores", "walltime"]: value = float(row[header[resource_name_mapping[key]]]) used_value = float(row[header[used_resource_name_mapping[key]]]) @@ -65,7 +69,6 @@ def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 (float(row[header[used_resource_name_mapping[key]]]) * float(row[header[used_resource_name_mapping["cores"]]])) \ * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) - yield Job( resources=resources, used_resources=used_resources, From b4d2eb2d77598c3387adaffae9b83724bb8c8d8a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 14:10:12 +0200 Subject: [PATCH 193/648] fixed pep8 error --- lapis/scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 10dfcb6..b4ea866 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -110,8 +110,8 @@ def _schedule_job(self, job) -> Drone: break else: try: - cost += 1 / (resources.get(resource_type, 0) // - job.resources[resource_type]) + cost += 1 / (resources.get(resource_type, 0) + // job.resources[resource_type]) except ZeroDivisionError: pass for additional_resource_type in [key for key in drone.pool_resources From d08160cc680c3a80329ed462ba61b4273ce27bbe Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 16:38:07 +0200 Subject: [PATCH 194/648] simplified generation of dict from usim capacity resources --- lapis/drone.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 99c0cf6..5b825ee 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -48,17 +48,11 @@ def __init__(self, scheduler, pool_resources: dict, @property def theoretical_available_resources(self): - levels = self.resources.levels - return { - key: getattr(levels, key) for key in self.pool_resources - } + return dict(self.resources.levels) @property def available_resources(self): - levels = self.used_resources.levels - return { - key: getattr(levels, key) for key in self.pool_resources - } + return dict(self.used_resources.levels) async def run(self): from lapis.monitor import sampling_required From a33cb5383b5fbecbb9cfaa1f592934cfdbb2105a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 16:42:14 +0200 Subject: [PATCH 195/648] access to resource does not default to 0 but will give KeyError now --- lapis/scheduler.py | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index b4ea866..bf8c034 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -110,9 +110,9 @@ def _schedule_job(self, job) -> Drone: break else: try: - cost += 1 / (resources.get(resource_type, 0) + cost += 1 / (resources[resource_type] // job.resources[resource_type]) - except ZeroDivisionError: + except KeyError: pass for additional_resource_type in [key for key in drone.pool_resources if key not in job.resources]: From 6701cdab13921fa7c752216e9a963e02476a4852 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 17:11:05 +0200 Subject: [PATCH 196/648] StopIteration for job generator is now properly handled --- lapis/job.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index d44b82b..93165c7 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -96,7 +96,10 @@ async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): count = 0 while True: if not job: - job = next(job_generator) + try: + job = next(job_generator) + except StopIteration: + return current_time = job.queue_date - base_date if time.now >= current_time: count += 1 From 3cd10ec4fe6b0ce190765d45f23d48a165898eba Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 17:57:59 +0200 Subject: [PATCH 197/648] added changes to stop scheduler after all jobs have been assigned to drones --- lapis/job.py | 1 + lapis/scheduler.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/lapis/job.py b/lapis/job.py index 7c4ec6b..0340c52 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -105,6 +105,7 @@ async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): try: job = next(job_generator) except StopIteration: + await job_queue.close() return current_time = job.queue_date - base_date if time.now >= current_time: diff --git a/lapis/scheduler.py b/lapis/scheduler.py index bf8c034..1ca9f5a 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -36,6 +36,7 @@ def __init__(self, job_queue): self.drone_cluster = [] self.interval = 60 self.job_queue = [] + self._collecting = True @property def drone_list(self): @@ -91,10 +92,13 @@ async def run(self): scope.do(best_match.start_job(job)) await instant self.job_queue.remove(job) + if not self._collecting and len(self.job_queue) == 0: + break async def _collect_jobs(self): async for job in self._stream_queue: self.job_queue.append(job) + self._collecting = False def _schedule_job(self, job) -> Drone: priorities = {} From 387d6d64f6e83e3ce9dde88616badfdce7b4694a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 6 Sep 2019 18:08:15 +0200 Subject: [PATCH 198/648] improved queue comparison --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 1ca9f5a..b4e94fa 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -92,7 +92,7 @@ async def run(self): scope.do(best_match.start_job(job)) await instant self.job_queue.remove(job) - if not self._collecting and len(self.job_queue) == 0: + if not self._collecting and not self.job_queue: break async def _collect_jobs(self): From cbb7db1c4f2661e5b9f78b0d2354c9943ff2f35e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 7 Sep 2019 22:08:37 +0200 Subject: [PATCH 199/648] changed requirements for documentation in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9485f2c..45bd2d9 100644 --- a/setup.py +++ 
b/setup.py @@ -29,7 +29,7 @@ 'click' ], extras_require={ - 'docs': ["sphinx", "sphinxcontrib-tikz"], + 'docs': ["sphinx", "sphinx_rtd_theme"], 'contrib': ['flake8', 'flake8-bugbear'] }, # metadata for package search From ee356de57a141ae44b4cf0f61ab42659b800387a Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Sun, 8 Sep 2019 10:41:50 +0200 Subject: [PATCH 200/648] checking Job walltime directly --- lapis/job.py | 3 +-- lapis_tests/test_job.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 0340c52..b462ceb 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -33,9 +33,8 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float .info("job uses different resources than specified, added", key, self.used_resources[key]) self.resources[key] = self.used_resources[key] - self.walltime = used_resources.pop("walltime", None) + self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) - assert self.walltime, "Job does not provide any walltime" self.queue_date = queue_date assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 12cc65f..dcf1828 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -8,9 +8,9 @@ class TestJob(object): def test_init(self): - with pytest.raises(AssertionError): + with pytest.raises(KeyError): Job(resources={}, used_resources={}) - with pytest.raises(AssertionError): + with pytest.raises(KeyError): Job(resources={"walltime": 100}, used_resources={}) assert Job(resources={}, used_resources={"walltime": 100}) with pytest.raises(AssertionError): From 6db13d86af8cc016d8ddc5293b813889a34a249d Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Sun, 8 Sep 2019 10:55:31 +0200 Subject: [PATCH 201/648] simplified job queueing --- lapis/job.py | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index b462ceb..758fd0c 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -85,35 +85,20 @@ async def run(self): } }) self._success = True - finally: - # release acquired resources - pass def __repr__(self): return '<%s: %s>' % (self.__class__.__name__, self._name or id(self)) -async def job_to_queue_scheduler(job_generator, job_queue, **kwargs): - job = next(job_generator) - base_date = job.queue_date - current_time = 0 - - count = 0 - while True: - if not job: - try: - job = next(job_generator) - except StopIteration: - await job_queue.close() - return - current_time = job.queue_date - base_date - if time.now >= current_time: - count += 1 - job.in_queue_since = time.now - await job_queue.put(job) - job = None - else: - if count > 0: - await sampling_required.set(True) - count = 0 - await (time == current_time) +async def job_to_queue_scheduler(job_generator, job_queue): + base_date = None + for job in job_generator: + if base_date is None: + base_date = job.queue_date + current_time = job.queue_date - base_date + if time.now < current_time: + await sampling_required.set(True) + await (time >= current_time) + job.in_queue_since = time.now + await job_queue.put(job) + await job_queue.close() From 8f8c105eafbd9656e346accdcfa96b0adc877625 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Sun, 8 Sep 2019 11:22:29 +0200 Subject: [PATCH 202/648] small simplification of Drone core --- lapis/drone.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 
deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 5b825ee..3b54cb1 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,7 +1,7 @@ import logging from cobald import interfaces -from usim import time, Scope, instant, TaskState +from usim import time, Scope, instant from usim.basics import Capacities, ResourcesUnavailable from lapis.job import Job @@ -13,14 +13,12 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): def __init__(self, scheduler, pool_resources: dict, - scheduling_duration: float, exclusive: bool = False, + scheduling_duration: float, ignore_resources: list = None): """ :param scheduler: :param pool_resources: :param scheduling_duration: - :param exclusive: Determines if the drone is used exclusively by jobs - in sequential order """ super(Drone, self).__init__() self.scheduler = scheduler @@ -41,7 +39,6 @@ def __init__(self, scheduler, pool_resources: dict, self.scheduler.register_drone(self) else: self._supply = 0 - self.exclusive = exclusive self.jobs = 0 self._allocation = None self._utilisation = None @@ -124,15 +121,15 @@ async def start_job(self, job: Job, kill: bool = False): self.used_resources.claim(**job.used_resources): self.jobs += 1 await sampling_required.set(True) - for resource_key in job.resources: - try: - if job.resources[resource_key] < \ - job.used_resources[resource_key]: - if kill: + if kill: + for resource_key in job.resources: + try: + if job.resources[resource_key] < \ + job.used_resources[resource_key]: job_execution.cancel() - except KeyError: - # check is not relevant if the data is not stored - pass + except KeyError: + # check is not relevant if the data is not stored + pass self.scheduler.update_drone(self) await job_execution.done except ResourcesUnavailable: @@ -142,12 +139,15 @@ async def start_job(self, job: Job, kill: bool = False): else: self.jobs -= 1 - if job_execution.status == TaskState.CANCELLED: + if not job.successful: for resource_key in job.resources: - usage = job.used_resources.get(resource_key, None) \ - or job.resources.get(resource_key, None) - value = usage / (job.resources.get(resource_key, None) - or self.pool_resources[resource_key]) + usage = job.used_resources.get( + resource_key, + job.resources.get(resource_key, None), + ) + value = usage / job.resources.get( + resource_key, self.pool_resources[resource_key] + ) if value > 1: logging.info("job_status", { "job_exceeds_%s" % resource_key: { From 3454e75bb91bd54504ffac2abf35595f0fca64a3 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Sun, 8 Sep 2019 11:50:57 +0200 Subject: [PATCH 203/648] simplified Drone management in Pool --- lapis/pool.py | 54 ++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/lapis/pool.py b/lapis/pool.py index 0d3d931..bc4b776 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -1,6 +1,8 @@ from typing import Generator, Callable from cobald import interfaces -from usim import time, eternity, Scope +from usim import eternity, Scope, each + +from .drone import Drone class Pool(interfaces.Pool): @@ -14,29 +16,22 @@ class Pool(interfaces.Pool): :param name: Name of the pool :param make_drone: Callable to create a drone with specific properties for this pool """ - def __init__(self, capacity: float = float('inf'), init: float = 0, - name: str = None, make_drone: Callable = None): + def __init__(self, make_drone: Callable, + *, + capacity: int = float('inf'), + init: int = 0, + name: str = None): super(Pool, self).__init__() - assert make_drone + assert init <= 
capacity self.make_drone = make_drone self._drones = [] self.init_pool(init=init) self._demand = 1 - self.level = init + self._level = init self._capacity = capacity self._name = name - def put(self, amount: float): - if self.level + amount > self._capacity: - raise ValueError - self.level += amount - - def get(self, amount: float): - if self.level - amount < 0: - raise ValueError - self.level -= amount - - def init_pool(self, init: float = 0): + def init_pool(self, init: int = 0): """ Initialisation of existing drones at creation time of pool. @@ -53,30 +48,27 @@ async def run(self): initialising new drones. Otherwise drones get removed. """ async with Scope() as scope: - while True: - drones_required = self._demand - self.level + async for _ in each(interval=1): + drones_required = min(self._demand, self._capacity) - self._level while drones_required > 0: drones_required -= 1 # start a new drone drone = self.make_drone(10) scope.do(drone.run()) self._drones.append(drone) - self.put(1) - if self.level > self._demand and self.level > 1: - empty_drone_found = False + self._level += 1 + if drones_required < 0: for drone in self.drones: if drone.jobs == 0: - empty_drone_found = True - break - if empty_drone_found: - self.get(1) - self._drones.remove(drone) - scope.do(drone.shutdown()) - del drone - await (time + 1) + drones_required += 1 + self._level -= 1 + self._drones.remove(drone) + scope.do(drone.shutdown()) + if drones_required == 0: + break @property - def drones(self) -> Generator[int, None, None]: + def drones(self) -> Generator[Drone, None, None]: for drone in self._drones: if drone.supply > 0: yield drone @@ -138,7 +130,7 @@ class StaticPool(Pool): :param resources: Dictionary of resources available for each pool instantiated within the pool """ - def __init__(self, capacity: float = 0, make_drone: Callable = None): + def __init__(self, make_drone: Callable, capacity: int = 0): assert capacity > 0, "Static pool was initialised without any resources..." 
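The Pool.run rewrite above boils the scaling behaviour down to one signed quantity, min(demand, capacity) - level: a positive value spawns drones, a negative value retires idle ones. The following sketch replays that adjustment in plain Python, outside of usim, so the arithmetic is easy to follow; FakeDrone and adjust are illustrative names only, not part of LAPIS.

    # Illustration of the per-interval supply adjustment in Pool.run:
    # spawn drones up to min(demand, capacity), retire only idle drones.
    class FakeDrone:
        def __init__(self, jobs: int = 0):
            self.jobs = jobs  # number of running jobs, 0 means idle

    def adjust(drones: list, demand: int, capacity: int) -> list:
        required = min(demand, capacity) - len(drones)
        while required > 0:            # scale up towards the capped demand
            drones.append(FakeDrone())
            required -= 1
        if required < 0:               # scale down, but never touch busy drones
            for drone in [drone for drone in drones if drone.jobs == 0]:
                drones.remove(drone)
                required += 1
                if required == 0:
                    break
        return drones

    pool = [FakeDrone(jobs=1), FakeDrone()]
    print(len(adjust(pool, demand=5, capacity=3)))  # 3: demand is capped at capacity
    print(len(adjust(pool, demand=1, capacity=3)))  # 1: only idle drones are retired

As in the patch itself, drones that still run jobs are never removed, so supply only shrinks as far as the idle drones allow.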
super(StaticPool, self).__init__(capacity=capacity, init=capacity, make_drone=make_drone) From 2a895ae3b58d9f78c39a9dd50f55ac2c622f180f Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Sun, 8 Sep 2019 12:00:14 +0200 Subject: [PATCH 204/648] moved comment --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index b4e94fa..de86401 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,9 +1,9 @@ from usim import Scope, each, instant -# TODO: does not work anymore as there is no method get_drone at pool from lapis.drone import Drone +# TODO: does not work anymore as there is no method get_drone at pool def job_scheduler(simulator): while True: for pool in simulator.pools: From 1e4409d2e5af775d684619e2152716036622bcaf Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 8 Sep 2019 22:17:11 +0200 Subject: [PATCH 205/648] added base document for topics --- docs/source/topics/cli.rst | 0 docs/source/topics/import.rst | 0 docs/source/topics/overview.rst | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/source/topics/cli.rst create mode 100644 docs/source/topics/import.rst create mode 100644 docs/source/topics/overview.rst diff --git a/docs/source/topics/cli.rst b/docs/source/topics/cli.rst new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/topics/import.rst b/docs/source/topics/import.rst new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst new file mode 100644 index 0000000..e69de29 From b44e51a0a2e029d439d97ac20023b5d5d341a08f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 8 Sep 2019 22:17:31 +0200 Subject: [PATCH 206/648] adapted title of documentation --- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 3b0e9d5..0cdd980 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,8 +3,8 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to lapis's documentation! -================================= +LAPIS -- Simulating and Scheduling for Opportunistic Resources +============================================================== .. 
toctree:: :maxdepth: 2 From 390923c5e7d4021758f8d03a2edd3f975627252d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 10 Sep 2019 22:42:28 +0200 Subject: [PATCH 207/648] fixed implementation of used_ratio and requested_ratio --- lapis/monitor/general.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 13a417a..ae8f0a3 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -29,11 +29,10 @@ def resource_statistics(simulator: "Simulator") -> list: "pool_type": "drone", "pool": repr(drone), "used_ratio": - used_resources.get(resource_type, 0) - / drone.pool_resources.get(resource_type, 0), + 1 - used_resources[resource_type] + / drone.pool_resources[resource_type], "requested_ratio": - resources.get(resource_type, 0) - / drone.pool_resources.get(resource_type, 0) + 1 - resources[resource_type] / drone.pool_resources[resource_type] }) return results From 5a58054dce2e0209698254391f521dee43f454f8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 8 Sep 2019 22:17:31 +0200 Subject: [PATCH 208/648] adapted title of documentation --- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 0cdd980..75cae10 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,8 +3,8 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -LAPIS -- Simulating and Scheduling for Opportunistic Resources -============================================================== +LAPIS -- Simulations for Opportunistic Resources +================================================ .. toctree:: :maxdepth: 2 From 7d622559919ad336f81153a9f29f59b5837ab2b1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 10 Sep 2019 22:29:22 +0200 Subject: [PATCH 209/648] added first structure for topical guide in documentation --- docs/index.rst | 3 ++- docs/source/topics/cli.rst | 2 ++ docs/source/topics/import.rst | 0 docs/source/topics/overview.rst | 10 ++++++++++ docs/source/topics/support.rst | 11 +++++++++++ 5 files changed, 25 insertions(+), 1 deletion(-) delete mode 100644 docs/source/topics/import.rst create mode 100644 docs/source/topics/support.rst diff --git a/docs/index.rst b/docs/index.rst index 75cae10..a2d7a44 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,8 @@ LAPIS -- Simulations for Opportunistic Resources :maxdepth: 2 :caption: Contents: - + source/topics/overview + source/api/modules Indices and tables ================== diff --git a/docs/source/topics/cli.rst b/docs/source/topics/cli.rst index e69de29..a5281dc 100644 --- a/docs/source/topics/cli.rst +++ b/docs/source/topics/cli.rst @@ -0,0 +1,2 @@ +Command Line Interface +====================== diff --git a/docs/source/topics/import.rst b/docs/source/topics/import.rst deleted file mode 100644 index e69de29..0000000 diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index e69de29..bbf2d9f 100644 --- a/docs/source/topics/overview.rst +++ b/docs/source/topics/overview.rst @@ -0,0 +1,10 @@ +Topical Guide +============= + +This is a collection of separate topics on LAPIS. + +.. 
toctree:: + :maxdepth: 1 + + cli + support \ No newline at end of file diff --git a/docs/source/topics/support.rst b/docs/source/topics/support.rst new file mode 100644 index 0000000..da75f3c --- /dev/null +++ b/docs/source/topics/support.rst @@ -0,0 +1,11 @@ +Supported File Formats +====================== + +TARDIS +------ + +HTCondor +-------- + +SWF Format +---------- From 497b3de39dc13aa4b3d8664ba419252a8b2a32fa Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 11 Sep 2019 13:24:22 +0200 Subject: [PATCH 210/648] removed custom extensions from documentation configuration --- docs/conf.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 931229a..69e32b4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -196,17 +196,3 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True - -# -- Custom Extensions ------------------------------------------------------- - - -def run_apidoc(_): - """Run the `apidoc` tool to generate `autodoc` documentation for all modules""" - from sphinx.apidoc import main - output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'source', 'api')) - source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', __about__.__title__)) - main(['--module-first', '--separate', '--output-dir=' + output_dir, source_dir, '--force']) - - -def setup(app): - app.connect('builder-inited', run_apidoc) From 4b43d6376434ab367af0e68de6fc49fd97aec120 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 11 Sep 2019 13:28:54 +0200 Subject: [PATCH 211/648] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 392dcce..a3276b5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018 Eileen Kuehn +Copyright (c) 2018 Eileen Kuehn, Max Fischer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From d7cc4a211661942e66a91b7065f9263914a21f70 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 11 Sep 2019 19:14:14 +0200 Subject: [PATCH 212/648] properly formatted logging string --- lapis/job.py | 6 +++--- lapis/job_io/htcondor.py | 2 +- lapis/simulator.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 758fd0c..e2b597a 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -29,9 +29,9 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float self.used_resources = used_resources for key in used_resources: if key not in resources: - logging.getLogger("implementation")\ - .info("job uses different resources than specified, added", - key, self.used_resources[key]) + logging.getLogger("implementation").info( + "job uses different resources than specified, added %s: %s", + key, self.used_resources[key]) self.resources[key] = self.used_resources[key] self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 1115f53..6a8637d 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -31,7 +31,7 @@ def htcondor_job_reader(iterable, resource_name_mapping={ # noqa: B006 for row in htcondor_reader: if float(row[used_resource_name_mapping["walltime"]]) <= 0: logging.getLogger("implementation").warning( - "removed job from htcondor import", row) + "removed job from htcondor import (%s)", 
row) continue resources = {} for key, original_key in resource_name_mapping.items(): diff --git a/lapis/simulator.py b/lapis/simulator.py index 06780c2..a93caf3 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -50,11 +50,11 @@ def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) def run(self, until=None): - print("running until", until) + print(f"running until {until}") run(self._simulate(until)) async def _simulate(self, end): - print("Starting simulation at %s" % time.now) + print(f"Starting simulation at {time.now}") async with until(time == end) if end else Scope() as while_running: for pool in self.pools: while_running.do(pool.run()) @@ -64,7 +64,7 @@ async def _simulate(self, end): for controller in self.controllers: while_running.do(controller.run()) while_running.do(self.monitoring.run()) - print("Finished simulation at %s" % time.now) + print(f"Finished simulation at {time.now}") async def _queue_jobs(self, job_input, job_reader): await job_to_queue_scheduler(job_generator=job_reader(job_input), From c1023d891adcdb8c539ce43f051876958dfc595d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 12 Sep 2019 11:47:56 +0200 Subject: [PATCH 213/648] documentation of output formats for simulation --- docs/source/topics/monitoring.rst | 97 +++++++++++++++++++++++++++++++ docs/source/topics/overview.rst | 3 +- 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 docs/source/topics/monitoring.rst diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst new file mode 100644 index 0000000..3673823 --- /dev/null +++ b/docs/source/topics/monitoring.rst @@ -0,0 +1,97 @@ +Monitoring Simulation Data +========================== + +Lapis provides some predefined functions that provide monitoring of relevant +information about your pools, resources, and jobs. Further, information +relevant to COBalD are provided. + +In the following you find tables summarising the available information. + +The CLI of LAPIS currently supports logging to + +* TCP, +* File, or +* Telegraf. + +See :doc:`cli` for details. + +Telegraf +-------- + +Resource Status +~~~~~~~~~~~~~~~ + +=========== ================== ============================= ======= +type name values comment +----------- ------------------ ----------------------------- ------- +measurement resource_status -- +tag tardis uuid +tag resource_type [memory | disk | cores | ...] 
+tag pool_configuration [``None`` | uuid] +tag pool_type [pool | drone] +tag pool uuid +field used_ratio ``float`` +field requested_ratio ``float`` +timestamp time ``float`` +=========== ================== ============================= ======= + +COBalD Status +~~~~~~~~~~~~~ + +=========== ================== ================= ============ +type name values comment +----------- ------------------ ----------------- ------------ +measurement cobald_status -- +tag tardis uuid +tag pool_configuration [``None`` | uuid] +tag pool_type [pool | drone] +tag pool uuid +field allocation ``float`` +field utilization ``float`` +field demand ``float`` +field supply ``float`` +field job_count ``int`` Running jobs +timestamp time ``float`` +=========== ================== ================= ============ + +Pool Status +~~~~~~~~~~~ + +=========== ================== ================================ ======= +type name values comment +----------- ------------------ -------------------------------- ------- +measurement system_status -- +tag tardis uuid +tag parent_pool uuid +tag pool_configuration [``None`` | uuid] +tag pool_type [pool | drone] +tag pool uuid +field status [DownState | CleanupState | ...] +timestamp time ``float`` +=========== ================== ================================ ======= + +User Demand +~~~~~~~~~~~ + +=========== =========== ========= ======= +type name values comment +----------- ----------- --------- ------- +measurement user_demand -- +tag tardis uuid +field value ``int`` +timestamp time ``float`` +=========== =========== ========= ======= + +Configuration +~~~~~~~~~~~~~ + +=========== ================== ============================= ======= +type name values comment +----------- ------------------ ----------------------------- ------- +measurement configuration -- +tag tardis uuid +tag pool_configuration uuid +tag resource_type [memory | disk | cores | ...] +field value ``float`` +timestamp time ``float`` +=========== ================== ============================= ======= diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index bbf2d9f..75232dc 100644 --- a/docs/source/topics/overview.rst +++ b/docs/source/topics/overview.rst @@ -7,4 +7,5 @@ This is a collection of separate topics on LAPIS. 
:maxdepth: 1 cli - support \ No newline at end of file + support + monitoring From 21590d212b985b9f1034b783cbc11b428b877962 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 20:06:08 +0200 Subject: [PATCH 214/648] removed old job scheduler --- lapis/scheduler.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index de86401..0b0b9ba 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -3,17 +3,6 @@ from lapis.drone import Drone -# TODO: does not work anymore as there is no method get_drone at pool -def job_scheduler(simulator): - while True: - for pool in simulator.pools: - while pool.level > 0 and simulator.global_demand.level > 0: - drone = yield from pool.get_drone(1) - simulator.env.process(drone.start_job(*next(simulator.job_generator))) - yield simulator.env.timeout(0) - yield simulator.env.timeout(1) - - class CondorJobScheduler(object): """ Goal of the htcondor job scheduler is to have a scheduler that somehow From df88be061425122a6a902ee3a34fd9609dbcd933 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 21:16:39 +0200 Subject: [PATCH 215/648] added version and description to lapis module --- lapis/__about__.py | 17 -------------- lapis/__init__.py | 3 +++ setup.py | 58 ---------------------------------------------- 3 files changed, 3 insertions(+), 75 deletions(-) delete mode 100644 lapis/__about__.py delete mode 100644 setup.py diff --git a/lapis/__about__.py b/lapis/__about__.py deleted file mode 100644 index d7e7ed5..0000000 --- a/lapis/__about__.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -=============================================================================== -Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator -=============================================================================== - -This is a **draft** for a scheduling simulator utilising opportunistic resources. 
-""" -__title__ = 'lapis' -__summary__ = 'Lapis is an adaptable, performant, and interactive scheduling ' \ - '(Lapis) simulator' -__url__ = 'https://github.com/MaineKuehn/lapis' - -__version__ = '0.1.0' -__author__ = 'Eileen Kuehn, Max Fischer' -__email__ = 'mainekuehn@gmail.com' -__copyright__ = '2019 %s' % __author__ -__keywords__ = 'opportunistic scheduling scheduler cobald simulator' diff --git a/lapis/__init__.py b/lapis/__init__.py index e69de29..a82ede9 100644 --- a/lapis/__init__.py +++ b/lapis/__init__.py @@ -0,0 +1,3 @@ +"""Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator""" + +__version__ = '0.1.0' diff --git a/setup.py b/setup.py deleted file mode 100644 index 9485f2c..0000000 --- a/setup.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -import os -from setuptools import setup, find_packages - -repo_base_dir = os.path.abspath(os.path.dirname(__file__)) -# pull in the packages metadata -package_about = {} -with open(os.path.join(repo_base_dir, "lapis", "__about__.py")) as about_file: - exec(about_file.read(), package_about) - - -with open(os.path.join(repo_base_dir, 'README.rst'), 'r') as README: - long_description = README.read() - -if __name__ == '__main__': - setup( - name=package_about['__title__'], - version=package_about['__version__'], - description=package_about['__summary__'], - long_description=long_description.strip(), - author=package_about['__author__'], - author_email=package_about['__email__'], - url=package_about['__url__'], - packages=find_packages(), - # dependencies - install_requires=[ - 'cobald', - 'usim', - 'click' - ], - extras_require={ - 'docs': ["sphinx", "sphinxcontrib-tikz"], - 'contrib': ['flake8', 'flake8-bugbear'] - }, - # metadata for package search - license='MIT', - # https://pypi.python.org/pypi?%3Aaction=list_classifiers - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Information Technology', - 'Intended Audience :: Science/Research', - 'Intended Audience :: System Administrators', - 'Topic :: Adaptive Technologies', - 'Topic :: Office/Business :: Scheduling', - 'Topic :: System :: Distributed Computing', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7' - ], - keywords=package_about['__keywords__'], - # unit tests - setup_requires=['pytest-runner'], - test_suite='lapis_tests', - tests_require=['pytest'], - ) From 18b39b7a18ce8cc400118c77653967aae030f90a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 21:17:04 +0200 Subject: [PATCH 216/648] added project toml --- pyproject.toml | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..182a5cf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,42 @@ +[build-system] +requires = ["flit"] +build-backend = "flit.buildapi" + +[tool.flit.metadata] +module = "lapis" +author = "Eileen Kuehn, Max Fischer" +author-email = "mainekuehn@gmail.com" +home-page = "https://github.com/MatterMiners/lapis" +description-file = "README.rst" +keywords = "htcondor simulation python cobald tardis opportunistic scheduling scheduler" +classifiers = [ + "License :: OSI Approved :: MIT License", + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Information 
Technology', + 'Intended Audience :: Science/Research', + 'Intended Audience :: System Administrators', + 'Topic :: Adaptive Technologies', + 'Topic :: Office/Business :: Scheduling', + 'Topic :: System :: Distributed Computing', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7' +] +requires = [ + "cobald", + "usim == 0.3", + "click", +] + +[tool.flit.metadata.requires-extra] +test = [ + "pytest >=4.3.0", + "flake8", + "flake8-bugbear", +] +doc = ["sphinx", "sphinx_rtd_theme"] +dev = ["pre-commit"] + +[tool.flit.metadata.urls] +Documentation = "https://lapis.readthedocs.io/en/latest/" From 84ae4ddd02e976b8ac4b1e30d240d93586c5da08 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 21:17:30 +0200 Subject: [PATCH 217/648] adapted travis config for flit and toml usage --- .travis.yml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index ba411fd..a77b3e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,20 +7,17 @@ python: os: - linux # - osx # osx+python installation fails -matrix: - # ignore all but the most recent, stable releases - allow_failures: - - python: "3.7-dev" before_script: - - pip install pip --upgrade - - pip --version + - python -m pip install pip --upgrade + - python -m pip --version - export PYTHONHASHSEED=${PYTHONHASHSEED:-${RANDOM}} - echo "export PYTHONHASHSEED=${PYTHONHASHSEED}" - - pip install .[contrib] - - pip install codecov + - python -m pip install .[test] + - python -m pip install codecov - export COVERAGE_PROCESS_START=$(pwd)/.coveragerc + - export PYTEST_ADDOPTS=-v script: - python -m flake8 - - coverage run setup.py test + - python -m coverage run -m pytest after_success: - coverage report && codecov From c46edb47921331c40cb5b69f135f875e5db0a304 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 21:18:06 +0200 Subject: [PATCH 218/648] adapted documentation config for toml and flit usage --- docs/conf.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 931229a..eef91e1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,18 +17,18 @@ # sys.path.insert(0, os.path.abspath('.')) import os import sys +import lapis sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from lapis import __about__ # -- Project information ----------------------------------------------------- -project = __about__.__title__ -copyright = __about__.__copyright__ -author = __about__.__author__ +project = "lapis" +author = "Eileen Kuehn, Max Fischer" +copyright = f"2019 {author}" # The short X.Y version -version = __about__.__version__ +version = lapis.__version__ # The full version, including alpha/beta/rc tags release = version @@ -204,7 +204,7 @@ def run_apidoc(_): """Run the `apidoc` tool to generate `autodoc` documentation for all modules""" from sphinx.apidoc import main output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'source', 'api')) - source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', __about__.__title__)) + source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', "lapis")) main(['--module-first', '--separate', '--output-dir=' + output_dir, source_dir, '--force']) From 5094a7c62281ff502cda8ce780aacf3ba50a823f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 21:54:48 +0200 Subject: [PATCH 219/648] bumped version to 0.1.1 --- lapis/__init__.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/__init__.py b/lapis/__init__.py index a82ede9..a0b9cff 100644 --- a/lapis/__init__.py +++ b/lapis/__init__.py @@ -1,3 +1,3 @@ """Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator""" -__version__ = '0.1.0' +__version__ = '0.1.1' From 1394a6ce3fb7e3be542b2a1c32c8d4aa0ba70167 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 22:04:28 +0200 Subject: [PATCH 220/648] ignore .idea folder --- .gitignore | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 666c4a5..6d07713 100644 --- a/.gitignore +++ b/.gitignore @@ -27,22 +27,7 @@ $RECYCLE.BIN/ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff: -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/dictionaries - -# Sensitive or high-churn files: -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.xml -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml - -# Gradle: -.idea/**/gradle.xml -.idea/**/libraries +.idea/ # CMake cmake-build-debug/ From 2cbf386da95911b2a2caa514515269c3e7505d16 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 22:20:03 +0200 Subject: [PATCH 221/648] adapted dist name and documentation link --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 182a5cf..abea400 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ build-backend = "flit.buildapi" [tool.flit.metadata] module = "lapis" +dist-name = "lapis-sim" author = "Eileen Kuehn, Max Fischer" author-email = "mainekuehn@gmail.com" home-page = "https://github.com/MatterMiners/lapis" @@ -39,4 +40,4 @@ doc = ["sphinx", "sphinx_rtd_theme"] dev = ["pre-commit"] [tool.flit.metadata.urls] -Documentation = "https://lapis.readthedocs.io/en/latest/" +Documentation = "https://lapis-sim.readthedocs.io/en/latest/" From abc6edd6b020baf650884b0255613ae5149f8fc7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 23:03:41 +0200 Subject: [PATCH 222/648] removed dependency to usim.basics --- lapis/drone.py | 3 +-- lapis/simulator.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 3b54cb1..d217993 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,8 +1,7 @@ import logging from cobald import interfaces -from usim import time, Scope, instant -from usim.basics import Capacities, ResourcesUnavailable +from usim import time, Scope, instant, Capacities, ResourcesUnavailable from lapis.job import Job diff --git a/lapis/simulator.py b/lapis/simulator.py index 06780c2..0a66829 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -1,8 +1,7 @@ import random from functools import partial -from usim import run, time, until, Scope -from usim.basics import Queue +from usim import run, time, until, Scope, Queue from lapis.drone import Drone from lapis.job import job_to_queue_scheduler From b0d500fed25de6b8309e2f90653c6ba4c0360722 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 23:04:26 +0200 Subject: [PATCH 223/648] removed reference to each --- lapis/monitor/__init__.py | 4 ++-- lapis/pool.py | 4 ++-- lapis/scheduler.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 6231428..01e6855 100644 --- a/lapis/monitor/__init__.py +++ 
b/lapis/monitor/__init__.py @@ -4,7 +4,7 @@ from typing import Callable, TYPE_CHECKING from cobald.monitor.format_json import JsonFormatter -from usim import time, Flag, each +from usim import time, Flag, delay if TYPE_CHECKING: from lapis.simulator import Simulator @@ -38,7 +38,7 @@ def __init__(self, simulator: "Simulator"): self._statistics = [] async def run(self): - async for _ in each(delay=1): + async for _ in delay(1): await sampling_required await sampling_required.set(False) for statistic in self._statistics: diff --git a/lapis/pool.py b/lapis/pool.py index bc4b776..eb4ef38 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -1,6 +1,6 @@ from typing import Generator, Callable from cobald import interfaces -from usim import eternity, Scope, each +from usim import eternity, Scope, interval from .drone import Drone @@ -48,7 +48,7 @@ async def run(self): initialising new drones. Otherwise drones get removed. """ async with Scope() as scope: - async for _ in each(interval=1): + async for _ in interval(1): drones_required = min(self._demand, self._capacity) - self._level while drones_required > 0: drones_required -= 1 diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 0b0b9ba..278a020 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,4 +1,4 @@ -from usim import Scope, each, instant +from usim import Scope, instant, interval from lapis.drone import Drone @@ -74,7 +74,7 @@ def update_drone(self, drone: Drone): async def run(self): async with Scope() as scope: scope.do(self._collect_jobs()) - async for _ in each(interval=self.interval): + async for _ in interval(self.interval): for job in self.job_queue: best_match = self._schedule_job(job) if best_match: From ffaf2e3b29797a9a804f2458d6d5533e091e0b77 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 23:09:32 +0200 Subject: [PATCH 224/648] adapted wrapper for running tests based on usim --- lapis_tests/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 630289d..fde813e 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -2,7 +2,6 @@ from functools import wraps from usim import run -from usim._core.loop import ActivityError from lapis.drone import Drone @@ -26,11 +25,7 @@ def run_test(*args, **kwargs): # https://github.com/pytest-dev/pytest/issues/1904 __tracebackhide__ = True # >>> This is not the frame you are looking for. Do read on. 
<<< - try: - return run(test_case(*args, **kwargs)) - except ActivityError as err: - # unwrap any exceptions - raise err.__cause__ + return run(test_case(*args, **kwargs)) return run_test From cb8050c3698dfe6e39f167a6dbd6d05f9eaad4d9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 24 Oct 2019 23:14:01 +0200 Subject: [PATCH 225/648] bumped required version of usim to 0.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index abea400..643a533 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] requires = [ "cobald", - "usim == 0.3", + "usim == 0.4", "click", ] From 9478bf9804dcbfee6095b4de21a2020a18019068 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 25 Oct 2019 14:07:56 +0200 Subject: [PATCH 226/648] bumped version to 0.2.0 --- lapis/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/__init__.py b/lapis/__init__.py index a0b9cff..97ebbd1 100644 --- a/lapis/__init__.py +++ b/lapis/__init__.py @@ -1,3 +1,3 @@ """Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator""" -__version__ = '0.1.1' +__version__ = '0.2.0' From d7183db785017076ecc3874a4a51e9f5c4120c93 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 25 Oct 2019 23:46:48 +0200 Subject: [PATCH 227/648] added logging for refused jobs --- lapis/drone.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index d217993..8988128 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -96,7 +96,6 @@ async def shutdown(self): self.scheduler.unregister_drone(self) await sampling_required.set(True) await (time + 1) - # print("[drone %s] has been shut down" % self) async def start_job(self, job: Job, kill: bool = False): """ @@ -139,6 +138,7 @@ async def start_job(self, job: Job, kill: bool = False): self.jobs -= 1 if not job.successful: + error_logged = False for resource_key in job.resources: usage = job.used_resources.get( resource_key, @@ -153,6 +153,13 @@ async def start_job(self, job: Job, kill: bool = False): repr(job): value } }) + error_logged = True + if not error_logged: + logging.info("job_status", { + "job_refused": { + repr(job): repr(self) + } + }) self._utilisation = self._allocation = None self.scheduler.update_drone(self) await sampling_required.set(True) From ba345278c0b24e73e94efc93323ea11f666cecc4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 25 Oct 2019 23:50:02 +0200 Subject: [PATCH 228/648] waiting a moment when resources cannot be claimed for jobs, related to #41 --- lapis/drone.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index 8988128..17bb14e 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -113,7 +113,6 @@ async def start_job(self, job: Job, kill: bool = False): self._utilisation = self._allocation = None job_execution = scope.do(job.run()) - await instant # waiting just a moment to enable job to set parameters try: async with self.resources.claim(**job.resources), \ self.used_resources.claim(**job.used_resources): @@ -131,8 +130,10 @@ async def start_job(self, job: Job, kill: bool = False): self.scheduler.update_drone(self) await job_execution.done except ResourcesUnavailable: + await instant job_execution.cancel() except AssertionError: + await instant job_execution.cancel() else: self.jobs -= 1 From f19f03fc949150e2f6535bde225f44dc6cd7a83d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 25 Oct 2019 23:50:58 
+0200 Subject: [PATCH 229/648] using correct exception for canceled job now --- lapis/job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index e2b597a..a5b79d5 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -1,7 +1,7 @@ import logging from usim import time -from usim import TaskCancelled +from usim import CancelTask from lapis.monitor import sampling_required @@ -73,7 +73,7 @@ async def run(self): }) try: await (time + self.walltime) - except TaskCancelled: + except CancelTask: self._success = False except BaseException: self._success = False From 161e03c74db501daa37b9c21e8176086c493472b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 26 Oct 2019 00:03:19 +0200 Subject: [PATCH 230/648] scopes are volatile now, fixes #38 --- lapis/simulator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 0a66829..584bc12 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -56,14 +56,14 @@ async def _simulate(self, end): print("Starting simulation at %s" % time.now) async with until(time == end) if end else Scope() as while_running: for pool in self.pools: - while_running.do(pool.run()) + while_running.do(pool.run(), volatile=True) for job_input, job_reader in self._job_generators: while_running.do(self._queue_jobs(job_input, job_reader)) while_running.do(self.job_scheduler.run()) for controller in self.controllers: - while_running.do(controller.run()) - while_running.do(self.monitoring.run()) - print("Finished simulation at %s" % time.now) + while_running.do(controller.run(), volatile=True) + while_running.do(self.monitoring.run(), volatile=True) + print(f"Finished simulation at {time.now}") async def _queue_jobs(self, job_input, job_reader): await job_to_queue_scheduler(job_generator=job_reader(job_input), From 8f4216129cad25352ff9d9b167a1e33e46b7a420 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:06:04 +0100 Subject: [PATCH 231/648] included reference to drone in job and made success optional bool --- lapis/job.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index a5b79d5..caf041d 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -1,17 +1,22 @@ import logging +from typing import Optional, TYPE_CHECKING from usim import time from usim import CancelTask from lapis.monitor import sampling_required +if TYPE_CHECKING: + from lapis.drone import Drone + class Job(object): __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", - "queue_date", "in_queue_since", "in_queue_until", "_name", "_success") + "queue_date", "in_queue_since", "in_queue_until", "_name", "drone", + "_success") def __init__(self, resources: dict, used_resources: dict, in_queue_since: float = 0, - queue_date: float = 0, name: str = None): + queue_date: float = 0, name: str = None, drone: "Drone" = None): """ Definition of a job that uses a specified amount of resources `used_resources` over a given amount of time, `walltime`. 
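A note on the volatile flag introduced in patch 230 above: pools, controllers, and monitoring never return on their own, so they are started as volatile tasks that do not keep the surrounding Scope alive and are wound down once the finite work (feeding and scheduling jobs) has completed. Below is a minimal sketch of that pattern using only the usim primitives already present in the simulator (run, Scope, time); the task names and timings are made up.

    from usim import run, time, Scope

    async def background_service():
        # stands in for Pool.run / Monitoring.run: never returns on its own
        while True:
            await (time + 1)

    async def finite_work():
        # stands in for feeding and scheduling the job queue
        await (time + 5)

    async def main():
        async with Scope() as scope:
            scope.do(background_service(), volatile=True)
            scope.do(finite_work())
        # without volatile=True the scope above would never be exited
        print(f"scope closed at {time.now}")  # expected: 5

    run(main())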
A job is described by its user @@ -24,6 +29,7 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float simulation scheduler :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job + :param drone: Drone where the job is running on """ self.resources = resources self.used_resources = used_resources @@ -39,15 +45,16 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since self.in_queue_until = None + self.drone = drone self._name = name - self._success = False + self._success: Optional[bool] = None @property def name(self) -> str: return self._name or id(self) @property - def successful(self) -> bool: + def successful(self) -> Optional[bool]: return self._success @property From 2fbab3d9992aba96b1c396d230f9d1e40a3ebb11 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:14:50 +0100 Subject: [PATCH 232/648] configured implementation logger --- lapis/simulator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapis/simulator.py b/lapis/simulator.py index af92dfa..0ba890f 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -1,3 +1,4 @@ +import logging import random from functools import partial @@ -11,6 +12,9 @@ from lapis.monitor.cobald import drone_statistics, pool_statistics +logging.getLogger("implementation").propagate = False + + class Simulator(object): def __init__(self, seed=1234): random.seed(seed) From ca317ed63245cb00f9ab10886fcc458ed06d3701 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:15:58 +0100 Subject: [PATCH 233/648] changed monitoring flag to queue and defined statistics as dependent on specific objects to be logged --- lapis/monitor/__init__.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 01e6855..0b04f6d 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -4,7 +4,7 @@ from typing import Callable, TYPE_CHECKING from cobald.monitor.format_json import JsonFormatter -from usim import time, Flag, delay +from usim import time, Queue if TYPE_CHECKING: from lapis.simulator import Simulator @@ -29,28 +29,33 @@ def filter(self, record) -> bool: return True -sampling_required = Flag() +sampling_required = Queue() class Monitoring(object): def __init__(self, simulator: "Simulator"): self.simulator = simulator - self._statistics = [] + self._statistics = {} async def run(self): - async for _ in delay(1): - await sampling_required - await sampling_required.set(False) - for statistic in self._statistics: + async for log_object in sampling_required: + for statistic in self._statistics.get(type(log_object), set()): # do the logging - for record in statistic(self.simulator): + for record in statistic(log_object): logging.getLogger(statistic.name).info( statistic.name, record ) def register_statistic(self, statistic: Callable): assert hasattr(statistic, "name") and hasattr(statistic, "logging_formatter") - self._statistics.append(statistic) + try: + for element in statistic.whitelist: + self._statistics.setdefault(element, set()).add(statistic) + except AttributeError: + logging.getLogger("implementation").warning( + f"Removing statistic {statistic.name} as no whitelist has been defined." 
+ ) + return # prepare the logger logger = logging.getLogger(statistic.name) From 4e7b9bccc901d85a36b43e70a8fc456f0c2cbde8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:18:07 +0100 Subject: [PATCH 234/648] adapted existing logging statistics to new format --- lapis/monitor/cobald.py | 58 +++++++++++++++----------------- lapis/monitor/general.py | 72 +++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 58 deletions(-) diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py index 3c6c879..48b27bc 100644 --- a/lapis/monitor/cobald.py +++ b/lapis/monitor/cobald.py @@ -2,37 +2,34 @@ from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter -from typing import TYPE_CHECKING +from lapis.drone import Drone from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler +from lapis.pool import Pool -if TYPE_CHECKING: - from lapis.simulator import Simulator - -def drone_statistics(simulator: "Simulator") -> list: +def drone_statistics(drone: Drone) -> list: """ Collect allocation, utilisation, demand and supply of drones. - :param simulator: the simulator + :param drone: the drone :return: list of records for logging """ - results = [] - for drone in simulator.job_scheduler.drone_list: - results.append({ - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "allocation": drone.allocation, - "utilisation": drone.utilisation, - "demand": drone.demand, - "supply": drone.supply, - "job_count": drone.jobs - }) + results = [{ + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "allocation": drone.allocation, + "utilisation": drone.utilisation, + "demand": drone.demand, + "supply": drone.supply, + "job_count": drone.jobs + }] return results drone_statistics.name = "cobald_status" +drone_statistics.whitelist = Drone, drone_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), @@ -43,28 +40,27 @@ def drone_statistics(simulator: "Simulator") -> list: } -def pool_statistics(simulator: "Simulator") -> list: +def pool_statistics(pool: Pool) -> list: """ Collect allocation, utilisation, demand and supply of pools. 
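With the register_statistic change above, a statistic is simply a callable that accepts one of its whitelisted objects and returns a list of record dicts, plus the three attributes the registration code checks for (name, whitelist, logging_formatter). A hypothetical user-defined statistic could therefore look like the sketch below; the drone_jobs name and its record layout are invented for illustration.

    import logging

    from cobald.monitor.format_json import JsonFormatter

    from lapis.drone import Drone
    from lapis.monitor import LoggingSocketHandler


    def drone_jobs(drone: Drone) -> list:
        """Hypothetical statistic: job count per drone."""
        return [{"pool": repr(drone), "job_count": drone.jobs}]


    # the attributes register_statistic() expects
    drone_jobs.name = "drone_jobs"
    drone_jobs.whitelist = Drone,
    drone_jobs.logging_formatter = {
        LoggingSocketHandler.__name__: JsonFormatter(),
        logging.StreamHandler.__name__: JsonFormatter(),
    }

    # registered once on the simulator's monitoring instance, e.g.
    # simulator.monitoring.register_statistic(drone_jobs)

Records for such a statistic are only produced when a Drone instance is put onto sampling_required, because the dispatch in Monitoring.run keys on type(log_object).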
- :param simulator: the simulator + :param pool: the pool :return: list of records to log """ - results = [] - for pool in simulator.pools: - results.append({ - "pool_configuration": "None", - "pool_type": "pool", - "pool": repr(pool), - "allocation": pool.allocation, - "utilisation": pool.utilisation, - "demand": pool.demand, - "supply": pool.supply, - }) + results = [{ + "pool_configuration": "None", + "pool_type": "pool", + "pool": repr(pool), + "allocation": pool.allocation, + "utilisation": pool.utilisation, + "demand": pool.demand, + "supply": pool.supply, + }] return results pool_statistics.name = "cobald_status" +pool_statistics.whitelist = Pool, pool_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index ae8f0a3..b46a9e4 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -5,39 +5,43 @@ from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter +from lapis.drone import Drone +from lapis.job import Job from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler +from lapis.pool import Pool +from lapis.scheduler import CondorJobScheduler, JobQueue if TYPE_CHECKING: from lapis.simulator import Simulator -def resource_statistics(simulator: "Simulator") -> list: +def resource_statistics(drone: Drone) -> list: """ Log ratio of used and requested resources for drones. - :param simulator: the simulator + :param drone: the drone :return: list of records for logging """ results = [] - for drone in simulator.job_scheduler.drone_list: - resources = drone.theoretical_available_resources - used_resources = drone.available_resources - for resource_type in resources: - results.append({ - "resource_type": resource_type, - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "used_ratio": - 1 - used_resources[resource_type] - / drone.pool_resources[resource_type], - "requested_ratio": - 1 - resources[resource_type] / drone.pool_resources[resource_type] - }) + resources = drone.theoretical_available_resources + used_resources = drone.available_resources + for resource_type in resources: + results.append({ + "resource_type": resource_type, + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "used_ratio": + 1 - used_resources[resource_type] + / drone.pool_resources[resource_type], + "requested_ratio": + 1 - resources[resource_type] / drone.pool_resources[resource_type] + }) return results resource_statistics.name = "resource_status" +resource_statistics.whitelist = Drone, resource_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), @@ -48,19 +52,21 @@ def resource_statistics(simulator: "Simulator") -> list: } -def user_demand(simulator: "Simulator") -> list: +def user_demand(job_queue: JobQueue) -> list: """ Log global user demand. 
- :param simulator: the simulator + :param scheduler: the scheduler :return: list of records for logging """ - return [{ - "value": len(simulator.job_scheduler.job_queue) + result = [{ + "value": len(job_queue) }] + return result user_demand.name = "user_demand" +user_demand.whitelist = JobQueue, user_demand.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), @@ -71,22 +77,33 @@ def user_demand(simulator: "Simulator") -> list: } -def job_statistics(simulator: "Simulator") -> list: +def job_statistics(scheduler: CondorJobScheduler) -> list: """ - Log number of jobs running in a drone. + Log number of jobs running in all drones. - :param simulator: the simulator + .. Note: + + The logging is currently synchronised with the frequency of the + scheduler. If a finer resolution is required, the update of drones + can be considered additionally. + + :param scheduler: the scheduler :return: list of records for logging """ result = 0 - for drone in simulator.job_scheduler.drone_list: - result += drone.jobs + for cluster in scheduler.drone_cluster.copy(): + for drone in cluster: + result += drone.jobs return [{ + "pool_configuration": "None", + "pool_type": "obs", + "pool": repr(scheduler), "job_count": result }] job_statistics.name = "cobald_status" +job_statistics.whitelist = CondorJobScheduler, job_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), @@ -97,7 +114,7 @@ def job_statistics(simulator: "Simulator") -> list: } -def pool_status(simulator: "Simulator") -> list: +def pool_status(pool: Pool) -> list: """ Log state changes of pools and drones. @@ -108,6 +125,7 @@ def pool_status(simulator: "Simulator") -> list: pool_status.name = "pool_status" +pool_status.whitelist = Pool, pool_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), From ea879cffd0cd5d0b62ff70639dc24fc7fb1b6eef Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:18:43 +0100 Subject: [PATCH 235/648] introduced method for logging job events --- lapis/monitor/general.py | 42 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index b46a9e4..126bced 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -114,6 +114,48 @@ def job_statistics(scheduler: CondorJobScheduler) -> list: } +def job_events(job: Job) -> list: + result = { + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(job.drone), + "job": repr(job), + } + if job.successful is None: + result["queue_time"] = job.queue_date + result["waiting_time"] = job.waiting_time + elif job.successful: + result["wall_time"] = job.walltime + result["success"] = 1 + else: + result["success"] = 0 + error_logged = False + for resource_key in job.resources: + usage = job.used_resources.get( + resource_key, job.resources.get(resource_key, None)) + value = usage / job.resources.get( + resource_key, job.drone.pool_resources[resource_key] + ) + if value > 1: + result[f"exceeded_{resource_key}"] = value + error_logged = True + if not error_logged: + result["refused_by"] = repr(job.drone) + return [result] + + +job_events.name = "job_event" +job_events.whitelist = Job, +job_events.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: 
LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, + resolution=1 + ) +} + + def pool_status(pool: Pool) -> list: """ Log state changes of pools and drones. From 71acab2bd3fe352b7cc27543495e817f972a089e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:21:02 +0100 Subject: [PATCH 236/648] adapted drone logging to new method --- lapis/drone.py | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 17bb14e..b2ab5dc 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,5 +1,3 @@ -import logging - from cobald import interfaces from usim import time, Scope, instant, Capacities, ResourcesUnavailable @@ -55,7 +53,7 @@ async def run(self): await (time + self.scheduling_duration) self._supply = 1 self.scheduler.register_drone(self) - await sampling_required.set(True) + await sampling_required.put(self) @property def supply(self) -> float: @@ -94,7 +92,7 @@ async def shutdown(self): from lapis.monitor import sampling_required self._supply = 0 self.scheduler.unregister_drone(self) - await sampling_required.set(True) + sampling_required.put(self) # TODO: introduce state of drone await (time + 1) async def start_job(self, job: Job, kill: bool = False): @@ -108,16 +106,17 @@ async def start_job(self, job: Job, kill: bool = False): requested resources :return: """ + job.drone = self async with Scope() as scope: from lapis.monitor import sampling_required self._utilisation = self._allocation = None job_execution = scope.do(job.run()) + self.jobs += 1 try: async with self.resources.claim(**job.resources), \ self.used_resources.claim(**job.used_resources): - self.jobs += 1 - await sampling_required.set(True) + await sampling_required.put(self) if kill: for resource_key in job.resources: try: @@ -135,35 +134,10 @@ async def start_job(self, job: Job, kill: bool = False): except AssertionError: await instant job_execution.cancel() - else: - self.jobs -= 1 - - if not job.successful: - error_logged = False - for resource_key in job.resources: - usage = job.used_resources.get( - resource_key, - job.resources.get(resource_key, None), - ) - value = usage / job.resources.get( - resource_key, self.pool_resources[resource_key] - ) - if value > 1: - logging.info("job_status", { - "job_exceeds_%s" % resource_key: { - repr(job): value - } - }) - error_logged = True - if not error_logged: - logging.info("job_status", { - "job_refused": { - repr(job): repr(self) - } - }) + self.jobs -= 1 self._utilisation = self._allocation = None self.scheduler.update_drone(self) - await sampling_required.set(True) + await sampling_required.put(self) def __repr__(self): return '<%s: %s>' % (self.__class__.__name__, id(self)) From f8ae4016c9889fd6ace159393af41a6738b1263a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:21:31 +0100 Subject: [PATCH 237/648] adapted job logging to new method --- lapis/job.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index caf041d..096f167 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -71,13 +71,8 @@ def waiting_time(self) -> float: async def run(self): self.in_queue_until = time.now - logging.info("job_status", { - "job_queue_time": { - repr(self): self.queue_date - }, "job_waiting_time": { - repr(self): self.waiting_time - } - }) + self._success = None + await sampling_required.put(self) try: await (time + self.walltime) except CancelTask: @@ -86,12 +81,8 @@ async 
def run(self): self._success = False raise else: - logging.info("job_status", { - "job_wall_time": { - repr(self): self.walltime - } - }) self._success = True + await sampling_required.put(self) def __repr__(self): return '<%s: %s>' % (self.__class__.__name__, self._name or id(self)) @@ -104,7 +95,6 @@ async def job_to_queue_scheduler(job_generator, job_queue): base_date = job.queue_date current_time = job.queue_date - base_date if time.now < current_time: - await sampling_required.set(True) await (time >= current_time) job.in_queue_since = time.now await job_queue.put(job) From 8fc43965eeb84164320c4a0a3eaab7946c2eb074 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:21:49 +0100 Subject: [PATCH 238/648] adapted scheduler logging to new method --- lapis/scheduler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 278a020..ad20ea2 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,6 +1,11 @@ from usim import Scope, instant, interval from lapis.drone import Drone +from lapis.monitor import sampling_required + + +class JobQueue(list): + pass class CondorJobScheduler(object): @@ -24,7 +29,7 @@ def __init__(self, job_queue): self._stream_queue = job_queue self.drone_cluster = [] self.interval = 60 - self.job_queue = [] + self.job_queue = JobQueue() self._collecting = True @property @@ -81,12 +86,16 @@ async def run(self): scope.do(best_match.start_job(job)) await instant self.job_queue.remove(job) + await sampling_required.put(self.job_queue) if not self._collecting and not self.job_queue: break + await sampling_required.put(self) async def _collect_jobs(self): async for job in self._stream_queue: self.job_queue.append(job) + # TODO: logging happens with each job + await sampling_required.put(self.job_queue) self._collecting = False def _schedule_job(self, job) -> Drone: From f7fe520b6af7cf48e4fb24ec6018b0d491583042 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:22:15 +0100 Subject: [PATCH 239/648] registered job event logging in simulation --- lapis/simulator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 0ba890f..fb98472 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -7,7 +7,7 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.monitor.general import user_demand, job_statistics, \ - resource_statistics, pool_status, configuration_information + resource_statistics, pool_status, configuration_information, job_events from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -32,6 +32,7 @@ def enable_monitoring(self): self.monitoring = Monitoring(self) self.monitoring.register_statistic(user_demand) self.monitoring.register_statistic(job_statistics) + self.monitoring.register_statistic(job_events) self.monitoring.register_statistic(pool_statistics) self.monitoring.register_statistic(drone_statistics) self.monitoring.register_statistic(resource_statistics) From a3938fbf6a3525bfec32a772b3e33d08984401a6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:35:07 +0100 Subject: [PATCH 240/648] adapted unit test to new logging method --- lapis_tests/utility/test_monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 6d048fa..8018ba3 100644 --- a/lapis_tests/utility/test_monitor.py +++ 
b/lapis_tests/utility/test_monitor.py @@ -66,7 +66,8 @@ def test_registration(self): monitoring = Monitoring(scheduler) statistics = resource_statistics monitoring.register_statistic(statistics) - assert statistics in monitoring._statistics + for element in statistics.whitelist: + assert statistics in monitoring._statistics.get(element) def test_registration_failure(self): scheduler = DummyScheduler() From c76e9ae82a703d96250cd3013a35dd5dc7cb720c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:43:34 +0100 Subject: [PATCH 241/648] added docstring for job event logging --- lapis/monitor/general.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 126bced..4291ce8 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -81,7 +81,7 @@ def job_statistics(scheduler: CondorJobScheduler) -> list: """ Log number of jobs running in all drones. - .. Note: + .. Note:: The logging is currently synchronised with the frequency of the scheduler. If a finer resolution is required, the update of drones @@ -115,6 +115,26 @@ def job_statistics(scheduler: CondorJobScheduler) -> list: def job_events(job: Job) -> list: + """ + Log relevant events for jobs. Relevant events are + + * start of a job, + * finishing of a job, either successful or not. + + Information about the start of a job are relevant to enable timely analysis + of waiting times. For finishing of jobs information about the success itself, + but also additional information on exceeded resources or refusal by the + drone are added. + + .. Warning:: + + The logging format includes the name / identifier of a job. This might + result in a huge index of the grafana database. The job is currently + included to enable better lookup and analysis of related events. 
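The job_events function above follows the same convention as the other statistics in this patch series: a plain callable plus name, whitelist and logging_formatter attributes, which Monitoring dispatches to based on the exact type of the object taken from the sampling queue. Below is a minimal sketch of such a callable; the function name, the record fields and the str whitelist are invented for illustration (the str whitelist mirrors the unit test added further down), and the argument-free Monitoring() constructor assumes the "removed simulator as a requirement for monitoring" patch later in this series:

    import logging

    from cobald.monitor.format_json import JsonFormatter

    from lapis.monitor import LoggingSocketHandler, Monitoring


    def dummy_events(obj: str) -> list:
        # one record per received object; Monitoring.run() calls this for every
        # object on the sampling queue whose type appears in the whitelist below
        return [{"object": repr(obj), "value": 1}]


    dummy_events.name = "dummy_event"
    dummy_events.whitelist = (str,)  # exact types this statistic wants to receive
    dummy_events.logging_formatter = {
        LoggingSocketHandler.__name__: JsonFormatter(),
        logging.StreamHandler.__name__: JsonFormatter(),
    }

    # registration checks for the three attributes set above and indexes the
    # callable under every type in its whitelist
    monitoring = Monitoring()
    monitoring.register_statistic(dummy_events)
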
+ + :param job: the job to log information for + :return: list of records for logging + """ result = { "pool_configuration": "None", "pool_type": "drone", From 756d11cafe49ad4c63b35bd8f7f7828ef2241b1c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:51:30 +0100 Subject: [PATCH 242/648] removed simulator as a requirement for monitoring --- lapis/monitor/__init__.py | 3 +-- lapis/simulator.py | 2 +- lapis_tests/utility/test_monitor.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 0b04f6d..75d6274 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -33,8 +33,7 @@ def filter(self, record) -> bool: class Monitoring(object): - def __init__(self, simulator: "Simulator"): - self.simulator = simulator + def __init__(self): self._statistics = {} async def run(self): diff --git a/lapis/simulator.py b/lapis/simulator.py index fb98472..cf615a9 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -29,7 +29,7 @@ def __init__(self, seed=1234): self.enable_monitoring() def enable_monitoring(self): - self.monitoring = Monitoring(self) + self.monitoring = Monitoring() self.monitoring.register_statistic(user_demand) self.monitoring.register_statistic(job_statistics) self.monitoring.register_statistic(job_events) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 8018ba3..57a4e09 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -63,7 +63,7 @@ def dummy_statistics(): class TestMonitoring(object): def test_registration(self): scheduler = DummyScheduler() - monitoring = Monitoring(scheduler) + monitoring = Monitoring() statistics = resource_statistics monitoring.register_statistic(statistics) for element in statistics.whitelist: @@ -71,7 +71,7 @@ def test_registration(self): def test_registration_failure(self): scheduler = DummyScheduler() - monitoring = Monitoring(scheduler) + monitoring = Monitoring() statistics = dummy_statistics with pytest.raises(AssertionError): monitoring.register_statistic(statistics) From 1ad4cd22a44728ed0029bc1b61f0578a8b768822 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:54:12 +0100 Subject: [PATCH 243/648] added docstring for monitoring class --- lapis/monitor/__init__.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 75d6274..14244d8 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -33,6 +33,12 @@ def filter(self, record) -> bool: class Monitoring(object): + """ + Enable monitoring of a simulation. Objects that change during simulation are + registered in a queue. Whenever objects in the queue become available, the + monitoring object takes care to dispatch the object to registered statistic + callables taking care to generate relevant monitoring output. + """ def __init__(self): self._statistics = {} @@ -45,7 +51,22 @@ async def run(self): statistic.name, record ) - def register_statistic(self, statistic: Callable): + def register_statistic(self, statistic: Callable) -> None: + """ + Register a callable that takes an object for logging and generates a list + of records. 
The callable should have the following accessible attributes: + + name: + The identifying name of the statistic for logging + logging_formatter: + Pre-defined formatters for the different supported logging formats + including socket, stream, and telegraf logging. + whitelist: + A tuple of objects the statistic callable is interested in to create + the required logging messages. + + :param statistic: Callable that returns a list of records for logging + """ assert hasattr(statistic, "name") and hasattr(statistic, "logging_formatter") try: for element in statistic.whitelist: From d7fbc21dfa9ecec5c96dc30b6f2b4adbc661ee89 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:57:09 +0100 Subject: [PATCH 244/648] removed unused import --- lapis/monitor/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 14244d8..fb3faeb 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -1,14 +1,11 @@ import copy import logging import logging.handlers -from typing import Callable, TYPE_CHECKING +from typing import Callable from cobald.monitor.format_json import JsonFormatter from usim import time, Queue -if TYPE_CHECKING: - from lapis.simulator import Simulator - class LoggingSocketHandler(logging.handlers.SocketHandler): def makePickle(self, record): From b5411f4c3a8094de23bbab6b1bc07498ec2ecb0e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 17:59:49 +0100 Subject: [PATCH 245/648] removed unused imports from tests --- lapis_tests/utility/test_monitor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 57a4e09..7ab6c9f 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -5,7 +5,7 @@ from cobald.monitor.format_line import LineProtocolFormatter from usim import Scope, time -from lapis_tests import via_usim, DummyScheduler +from lapis_tests import via_usim from . 
import make_test_logger @@ -62,7 +62,6 @@ def dummy_statistics(): class TestMonitoring(object): def test_registration(self): - scheduler = DummyScheduler() monitoring = Monitoring() statistics = resource_statistics monitoring.register_statistic(statistics) @@ -70,7 +69,6 @@ def test_registration(self): assert statistics in monitoring._statistics.get(element) def test_registration_failure(self): - scheduler = DummyScheduler() monitoring = Monitoring() statistics = dummy_statistics with pytest.raises(AssertionError): From 08d90e1f9c05046b848c8fefb974f2940da4069a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 18:13:53 +0100 Subject: [PATCH 246/648] added more tests for registering of statistics --- lapis_tests/utility/test_monitor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 7ab6c9f..0e2cf9e 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -73,4 +73,12 @@ def test_registration_failure(self): statistics = dummy_statistics with pytest.raises(AssertionError): monitoring.register_statistic(statistics) - assert statistics not in monitoring._statistics + assert all(statistics not in stat for stat in monitoring._statistics.values()) + # define required attributes except whitelist + statistics.name = "test" + statistics.logging_formatter = {} + monitoring.register_statistic(statistics) + assert all(statistics not in stat for stat in monitoring._statistics.values()) + statistics.whitelist = str, + monitoring.register_statistic(statistics) + assert all(statistics in stat for stat in monitoring._statistics.values()) From 4b53a15ecaf3494726d12ee20a1bb40da07e8420 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 20:56:03 +0100 Subject: [PATCH 247/648] scheduler updates available resources after job was matched --- lapis/scheduler.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 278a020..a9a6815 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,4 +1,5 @@ -from usim import Scope, instant, interval +from typing import Dict +from usim import Scope, interval from lapis.drone import Drone @@ -46,7 +47,7 @@ def unregister_drone(self, drone: Drone): if len(cluster) == 0: self.drone_cluster.remove(cluster) - def _add_drone(self, drone: Drone): + def _add_drone(self, drone: Drone, drone_resources: Dict = None): minimum_distance_cluster = None distance = float("Inf") if len(self.drone_cluster) > 0: @@ -54,9 +55,14 @@ def _add_drone(self, drone: Drone): current_distance = 0 for key in {*cluster[0].pool_resources, *drone.pool_resources}: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0)) + if drone_resources: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone_resources.get(key, 0)) + else: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0)) if current_distance < distance: minimum_distance_cluster = cluster distance = current_distance @@ -79,8 +85,14 @@ async def run(self): best_match = self._schedule_job(job) if best_match: scope.do(best_match.start_job(job)) - await instant self.job_queue.remove(job) + self.unregister_drone(best_match) + left_resources = best_match.theoretical_available_resources + 
left_resources = { + key: value - job.resources.get(key, 0) for + key, value in left_resources.items() + } + self._add_drone(best_match, left_resources) if not self._collecting and not self.job_queue: break From 330655428d2538178e05e8375f1e1d34744ea239 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 21:41:31 +0100 Subject: [PATCH 248/648] Update lapis/drone.py Co-Authored-By: Max Fischer --- lapis/drone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index b2ab5dc..9b33dde 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -92,7 +92,7 @@ async def shutdown(self): from lapis.monitor import sampling_required self._supply = 0 self.scheduler.unregister_drone(self) - sampling_required.put(self) # TODO: introduce state of drone + await sampling_required.put(self) # TODO: introduce state of drone await (time + 1) async def start_job(self, job: Job, kill: bool = False): From 9d75c7d1dc01d40ab0140be476dd1e2874041732 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 22:22:46 +0100 Subject: [PATCH 249/648] added config for pre-commit --- .pre-commit-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5f10048 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-merge-conflict + - id: end-of-file-fixer + - id: flake8 +- repo: https://github.com/psf/black + rev: 19.3b0 + hooks: + - id: black From 6cee3154b7c851e0ff381ed43de033ca5275899d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 22:24:54 +0100 Subject: [PATCH 250/648] added configuration for black execution --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index a77b3e3..f92b60a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,10 @@ before_script: - export PYTEST_ADDOPTS=-v script: - python -m flake8 + - | + if [[ $TRAVIS_PYTHON_VERSION != 'pypy3'* ]]; then + python -m black --target-version py36 --check src/ cobald_tests/ + fi - python -m coverage run -m pytest after_success: - coverage report && codecov From aae25f2b94c10c03e9c728760c45f7f6910ca4e7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 22:26:32 +0100 Subject: [PATCH 251/648] added requirement for black --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 643a533..4ead1c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ test = [ "pytest >=4.3.0", "flake8", "flake8-bugbear", + "black", ] doc = ["sphinx", "sphinx_rtd_theme"] dev = ["pre-commit"] From d1806dc0d4dd553a851e7846a1d909e66db7389e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 22:33:20 +0100 Subject: [PATCH 252/648] black formatted existing files --- docs/conf.py | 81 ++++++++++++--------- lapis/__init__.py | 2 +- lapis/cli/simulate.py | 105 ++++++++++++++++++---------- lapis/controller.py | 40 +++++++---- lapis/drone.py | 32 ++++++--- lapis/job.py | 32 +++++++-- lapis/job_io/htcondor.py | 74 ++++++++++++-------- lapis/job_io/swf.py | 65 ++++++++++------- lapis/monitor/__init__.py | 13 ++-- lapis/monitor/cobald.py | 56 ++++++++------- lapis/monitor/general.py | 82 +++++++++++----------- lapis/pool.py | 23 +++--- lapis/pool_io/htcondor.py | 39 +++++++---- lapis/pool_io/machines.py | 27 ++++--- 
lapis/scheduler.py | 23 +++--- lapis/simulator.py | 22 ++++-- lapis_tests/__init__.py | 4 +- lapis_tests/job_io/test_htcondor.py | 10 +-- lapis_tests/job_io/test_swf.py | 5 +- lapis_tests/test_job.py | 30 +++++--- lapis_tests/utility/__init__.py | 4 +- lapis_tests/utility/test_monitor.py | 29 ++++---- 22 files changed, 486 insertions(+), 312 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index eef91e1..ff54bc8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,7 +18,8 @@ import os import sys import lapis -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) # -- Project information ----------------------------------------------------- @@ -43,24 +44,24 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.imgmath', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.imgmath", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -72,7 +73,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -83,7 +84,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -94,7 +95,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -110,7 +111,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'lapisdoc' +htmlhelp_basename = "lapisdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -119,15 +120,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -137,8 +135,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ - (master_doc, 'lapis.tex', 'lapis Documentation', - 'Eileen Kuehn, Max Fischer', 'manual'), + ( + master_doc, + "lapis.tex", + "lapis Documentation", + "Eileen Kuehn, Max Fischer", + "manual", + ) ] @@ -146,10 +149,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'lapis', 'lapis Documentation', - [author], 1) -] +man_pages = [(master_doc, "lapis", "lapis Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -158,9 +158,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'lapis', 'lapis Documentation', - author, 'lapis', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "lapis", + "lapis Documentation", + author, + "lapis", + "One line description of project.", + "Miscellaneous", + ) ] @@ -179,7 +185,7 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- @@ -188,8 +194,8 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - "python": ('https://docs.python.org/3', None), - "usim": ('https://usim.readthedocs.io/en/stable', None), + "python": ("https://docs.python.org/3", None), + "usim": ("https://usim.readthedocs.io/en/stable", None), } # -- Options for todo extension ---------------------------------------------- @@ -203,10 +209,21 @@ def run_apidoc(_): """Run the `apidoc` tool to generate `autodoc` documentation for all modules""" from sphinx.apidoc import main - output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'source', 'api')) - source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', "lapis")) - main(['--module-first', '--separate', '--output-dir=' + output_dir, source_dir, '--force']) + + output_dir = os.path.abspath( + os.path.join(os.path.dirname(__file__), "source", "api") + ) + source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "lapis")) + main( + [ + "--module-first", + "--separate", + "--output-dir=" + output_dir, + source_dir, + "--force", + ] + ) def setup(app): - app.connect('builder-inited', run_apidoc) + app.connect("builder-inited", run_apidoc) diff --git a/lapis/__init__.py b/lapis/__init__.py index 97ebbd1..8e1bb65 100644 --- a/lapis/__init__.py +++ b/lapis/__init__.py @@ -1,3 +1,3 @@ """Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator""" -__version__ = '0.2.0' +__version__ = "0.2.0" diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 2c954ad..aa176ee 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -13,19 +13,17 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator -from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler, \ - SimulationTimeFilter +from lapis.monitor import ( + LoggingSocketHandler, + LoggingUDPSocketHandler, + SimulationTimeFilter, +) last_step = 0 -job_import_mapper = { - "htcondor": htcondor_job_reader, - "swf": swf_job_reader -} +job_import_mapper = {"htcondor": htcondor_job_reader, "swf": swf_job_reader} -pool_import_mapper = { - "htcondor": htcondor_pool_reader -} +pool_import_mapper = {"htcondor": htcondor_pool_reader} @click.group() @@ -37,15 +35,16 @@ 
@click.pass_context def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): ctx.ensure_object(dict) - ctx.obj['seed'] = seed - ctx.obj['until'] = until + ctx.obj["seed"] = seed + ctx.obj["until"] = until monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) time_filter = SimulationTimeFilter() monitoring_logger.addFilter(time_filter) if log_tcp: socketHandler = LoggingSocketHandler( - 'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT) + "localhost", logging.handlers.DEFAULT_TCP_LOGGING_PORT + ) socketHandler.setFormatter(JsonFormatter()) monitoring_logger.addHandler(socketHandler) if log_file: @@ -54,45 +53,63 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): monitoring_logger.addHandler(streamHandler) if log_telegraf: telegrafHandler = LoggingUDPSocketHandler( - "localhost", logging.handlers.DEFAULT_UDP_LOGGING_PORT) + "localhost", logging.handlers.DEFAULT_UDP_LOGGING_PORT + ) telegrafHandler.setFormatter(LineProtocolFormatter(resolution=1)) monitoring_logger.addHandler(telegrafHandler) @cli.command() -@click.option("--job-file", "job_file", type=( - click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--pool-file", "pool_file", type=( - click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option( + "--job-file", + "job_file", + type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))), +) +@click.option( + "--pool-file", + "pool_file", + type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), + multiple=True, +) @click.pass_context def static(ctx, job_file, pool_file): click.echo("starting static environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator( - job_input=file, job_reader=job_import_mapper[file_type]) + job_input=file, job_reader=job_import_mapper[file_type] + ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: pool_file, pool_file_type = current_pool simulator.create_pools( pool_input=pool_file, pool_reader=pool_import_mapper[pool_file_type], - pool_type=StaticPool) + pool_type=StaticPool, + ) simulator.run(until=ctx.obj["until"]) @cli.command() -@click.option("--job-file", "job_file", type=( - click.File("r"), click.Choice(list(job_import_mapper.keys())))) -@click.option("--pool-file", "pool_file", type=( - click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option( + "--job-file", + "job_file", + type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))), +) +@click.option( + "--pool-file", + "pool_file", + type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), + multiple=True, +) @click.pass_context def dynamic(ctx, job_file, pool_file): click.echo("starting dynamic environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator( - job_input=file, job_reader=job_import_mapper[file_type]) + job_input=file, job_reader=job_import_mapper[file_type] + ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: file, file_type = current_pool @@ -100,39 +117,55 @@ def dynamic(ctx, job_file, pool_file): pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=Pool, - controller=SimulatedLinearController) + controller=SimulatedLinearController, + ) simulator.run(until=ctx.obj["until"]) @cli.command() -@click.option("--job-file", "job_file", type=( - click.File("r"), 
click.Choice(list(job_import_mapper.keys())))) -@click.option("--static-pool-file", "static_pool_file", type=( - click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) -@click.option("--dynamic-pool-file", "dynamic_pool_file", type=( - click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True) +@click.option( + "--job-file", + "job_file", + type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))), +) +@click.option( + "--static-pool-file", + "static_pool_file", + type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), + multiple=True, +) +@click.option( + "--dynamic-pool-file", + "dynamic_pool_file", + type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), + multiple=True, +) @click.pass_context def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): click.echo("starting hybrid environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator( - job_input=file, job_reader=job_import_mapper[file_type]) + job_input=file, job_reader=job_import_mapper[file_type] + ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in static_pool_file: file, file_type = current_pool simulator.create_pools( - pool_input=file, pool_reader=pool_import_mapper[file_type], - pool_type=StaticPool) + pool_input=file, + pool_reader=pool_import_mapper[file_type], + pool_type=StaticPool, + ) for current_pool in dynamic_pool_file: file, file_type = current_pool simulator.create_pools( pool_input=file, pool_reader=pool_import_mapper[file_type], pool_type=Pool, - controller=SimulatedLinearController) + controller=SimulatedLinearController, + ) simulator.run(until=ctx.obj["until"]) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/lapis/controller.py b/lapis/controller.py index 32681bf..9a44b46 100644 --- a/lapis/controller.py +++ b/lapis/controller.py @@ -5,10 +5,12 @@ class SimulatedLinearController(LinearController): - def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, - rate=1, interval=1): + def __init__( + self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1 + ): super(SimulatedLinearController, self).__init__( - target, low_utilisation, high_allocation, rate, interval) + target, low_utilisation, high_allocation, rate, interval + ) async def run(self): while True: @@ -17,13 +19,23 @@ async def run(self): class SimulatedRelativeSupplyController(RelativeSupplyController): - def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, - low_scale=0.9, high_scale=1.1, - interval=1): + def __init__( + self, + target: Pool, + low_utilisation=0.5, + high_allocation=0.5, + low_scale=0.9, + high_scale=1.1, + interval=1, + ): super(SimulatedRelativeSupplyController, self).__init__( - target=target, low_utilisation=low_utilisation, - high_allocation=high_allocation, low_scale=low_scale, - high_scale=high_scale, interval=interval) + target=target, + low_utilisation=low_utilisation, + high_allocation=high_allocation, + low_scale=low_scale, + high_scale=high_scale, + interval=interval, + ) async def run(self): while True: @@ -32,18 +44,20 @@ async def run(self): class SimulatedCostController(SimulatedLinearController): - def __init__(self, target: Pool, low_utilisation=0.5, high_allocation=0.5, - rate=1, interval=1): + def __init__( + self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1 + ): self.current_cost = 1 super(SimulatedCostController, 
self).__init__( - target, low_utilisation, high_allocation, rate, interval) + target, low_utilisation, high_allocation, rate, interval + ) def regulate(self, interval): allocation = 0 for drone in self.target.drones: allocation += drone.allocation if self.target.supply - allocation <= 1: - if self.target.utilisation >= .8: + if self.target.utilisation >= 0.8: self.target.demand = int(allocation + self.current_cost) self.current_cost += 1 else: diff --git a/lapis/drone.py b/lapis/drone.py index 9b33dde..b6d9cab 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -9,9 +9,13 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): - def __init__(self, scheduler, pool_resources: dict, - scheduling_duration: float, - ignore_resources: list = None): + def __init__( + self, + scheduler, + pool_resources: dict, + scheduling_duration: float, + ignore_resources: list = None, + ): """ :param scheduler: :param pool_resources: @@ -25,7 +29,8 @@ def __init__(self, scheduler, pool_resources: dict, self.used_resources = Capacities(**pool_resources) if ignore_resources: self._valid_resource_keys = [ - resource for resource in self.pool_resources + resource + for resource in self.pool_resources if resource not in ignore_resources ] else: @@ -50,6 +55,7 @@ def available_resources(self): async def run(self): from lapis.monitor import sampling_required + await (time + self.scheduling_duration) self._supply = 1 self.scheduler.register_drone(self) @@ -84,12 +90,14 @@ def _init_allocation_and_utilisation(self): resources = [] for resource_key in self._valid_resource_keys: resources.append( - getattr(levels, resource_key) / self.pool_resources[resource_key]) + getattr(levels, resource_key) / self.pool_resources[resource_key] + ) self._allocation = max(resources) self._utilisation = min(resources) async def shutdown(self): from lapis.monitor import sampling_required + self._supply = 0 self.scheduler.unregister_drone(self) await sampling_required.put(self) # TODO: introduce state of drone @@ -109,19 +117,23 @@ async def start_job(self, job: Job, kill: bool = False): job.drone = self async with Scope() as scope: from lapis.monitor import sampling_required + self._utilisation = self._allocation = None job_execution = scope.do(job.run()) self.jobs += 1 try: - async with self.resources.claim(**job.resources), \ - self.used_resources.claim(**job.used_resources): + async with self.resources.claim( + **job.resources + ), self.used_resources.claim(**job.used_resources): await sampling_required.put(self) if kill: for resource_key in job.resources: try: - if job.resources[resource_key] < \ - job.used_resources[resource_key]: + if ( + job.resources[resource_key] + < job.used_resources[resource_key] + ): job_execution.cancel() except KeyError: # check is not relevant if the data is not stored @@ -140,4 +152,4 @@ async def start_job(self, job: Job, kill: bool = False): await sampling_required.put(self) def __repr__(self): - return '<%s: %s>' % (self.__class__.__name__, id(self)) + return "<%s: %s>" % (self.__class__.__name__, id(self)) diff --git a/lapis/job.py b/lapis/job.py index 096f167..3e722c2 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -11,12 +11,28 @@ class Job(object): - __slots__ = ("resources", "used_resources", "walltime", "requested_walltime", - "queue_date", "in_queue_since", "in_queue_until", "_name", "drone", - "_success") + __slots__ = ( + "resources", + "used_resources", + "walltime", + "requested_walltime", + "queue_date", + "in_queue_since", + "in_queue_until", + "_name", + "drone", + 
"_success", + ) - def __init__(self, resources: dict, used_resources: dict, in_queue_since: float = 0, - queue_date: float = 0, name: str = None, drone: "Drone" = None): + def __init__( + self, + resources: dict, + used_resources: dict, + in_queue_since: float = 0, + queue_date: float = 0, + name: str = None, + drone: "Drone" = None, + ): """ Definition of a job that uses a specified amount of resources `used_resources` over a given amount of time, `walltime`. A job is described by its user @@ -37,7 +53,9 @@ def __init__(self, resources: dict, used_resources: dict, in_queue_since: float if key not in resources: logging.getLogger("implementation").info( "job uses different resources than specified, added %s: %s", - key, self.used_resources[key]) + key, + self.used_resources[key], + ) self.resources[key] = self.used_resources[key] self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) @@ -85,7 +103,7 @@ async def run(self): await sampling_required.put(self) def __repr__(self): - return '<%s: %s>' % (self.__class__.__name__, self._name or id(self)) + return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) async def job_to_queue_scheduler(job_generator, job_queue): diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 6a8637d..09bd7e1 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -4,52 +4,64 @@ from lapis.job import Job -def htcondor_job_reader(iterable, resource_name_mapping={ # noqa: B006 - "cores": "RequestCpus", - "walltime": "RequestWalltime", # s - "memory": "RequestMemory", # MiB - "disk": "RequestDisk" # KiB -}, used_resource_name_mapping={ # noqa: B006 - "queuetime": "QDate", - "walltime": "RemoteWallClockTime", # s - "cores": "Number of Allocated Processors", - "memory": "MemoryUsage", # MB - "disk": "DiskUsage_RAW" # KiB -}, unit_conversion_mapping={ # noqa: B006 - "RequestCpus": 1, - "RequestWalltime": 1, - "RequestMemory": 1.024 / 1024, - "RequestDisk": 1.024 / 1024 / 1024, - "queuetime": 1, - "RemoteWallClockTime": 1, - "Number of Allocated Processors": 1, - "MemoryUsage": 1 / 1024, - "DiskUsage_RAW": 1.024 / 1024 / 1024 -}): - htcondor_reader = csv.DictReader(iterable, delimiter=' ', quotechar="'") +def htcondor_job_reader( + iterable, + resource_name_mapping={ # noqa: B006 + "cores": "RequestCpus", + "walltime": "RequestWalltime", # s + "memory": "RequestMemory", # MiB + "disk": "RequestDisk", # KiB + }, + used_resource_name_mapping={ # noqa: B006 + "queuetime": "QDate", + "walltime": "RemoteWallClockTime", # s + "cores": "Number of Allocated Processors", + "memory": "MemoryUsage", # MB + "disk": "DiskUsage_RAW", # KiB + }, + unit_conversion_mapping={ # noqa: B006 + "RequestCpus": 1, + "RequestWalltime": 1, + "RequestMemory": 1.024 / 1024, + "RequestDisk": 1.024 / 1024 / 1024, + "queuetime": 1, + "RemoteWallClockTime": 1, + "Number of Allocated Processors": 1, + "MemoryUsage": 1 / 1024, + "DiskUsage_RAW": 1.024 / 1024 / 1024, + }, +): + htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") for row in htcondor_reader: if float(row[used_resource_name_mapping["walltime"]]) <= 0: logging.getLogger("implementation").warning( - "removed job from htcondor import (%s)", row) + "removed job from htcondor import (%s)", row + ) continue resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(row[original_key]) \ - * unit_conversion_mapping.get(original_key, 1) + resources[key] = float(row[original_key]) * 
unit_conversion_mapping.get( + original_key, 1 + ) except ValueError: pass used_resources = { - "cores": (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"]) - / float(row[used_resource_name_mapping["walltime"]])) + "cores": ( + float(row["RemoteSysCpu"]) + + float(row["RemoteUserCpu"]) + / float(row[used_resource_name_mapping["walltime"]]) + ) * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) } for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] - used_resources[key] = \ - float(row[original_key]) * unit_conversion_mapping.get(original_key, 1) + used_resources[key] = float( + row[original_key] + ) * unit_conversion_mapping.get(original_key, 1) yield Job( resources=resources, used_resources=used_resources, - queue_date=float(row[used_resource_name_mapping["queuetime"]])) + queue_date=float(row[used_resource_name_mapping["queuetime"]]), + ) diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 4d5e2d6..baf45df 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -8,19 +8,24 @@ from lapis.job import Job -def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 - "cores": "Requested Number of Processors", - "walltime": "Requested Time", - "memory": "Requested Memory" -}, used_resource_name_mapping={ # noqa: B006 - "walltime": "Run Time", - "cores": "Number of Allocated Processors", - "memory": "Used Memory", - "queuetime": "Submit Time" -}, unit_conversion_mapping={ # noqa: B006 - "Used Memory": 1 / 1024 / 1024, - "Requested Memory": 1 / 2114 / 1024 -}): +def swf_job_reader( + iterable, + resource_name_mapping={ # noqa: B006 + "cores": "Requested Number of Processors", + "walltime": "Requested Time", + "memory": "Requested Memory", + }, + used_resource_name_mapping={ # noqa: B006 + "walltime": "Run Time", + "cores": "Number of Allocated Processors", + "memory": "Used Memory", + "queuetime": "Submit Time", + }, + unit_conversion_mapping={ # noqa: B006 + "Used Memory": 1 / 1024 / 1024, + "Requested Memory": 1 / 2114 / 1024, + }, +): header = { "Job Number": 0, "Submit Time": 1, @@ -39,10 +44,13 @@ def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 "Queue Number": 14, "Partition Number": 15, "Preceding Job Number": 16, - "Think Time from Preceding Job": 17 # s + "Think Time from Preceding Job": 17, # s } - reader = csv.reader((line for line in iterable if line[0] != ';'), - delimiter=' ', skipinitialspace=True) + reader = csv.reader( + (line for line in iterable if line[0] != ";"), + delimiter=" ", + skipinitialspace=True, + ) for row in reader: resources = {} used_resources = {} @@ -55,22 +63,25 @@ def swf_job_reader(iterable, resource_name_mapping={ # noqa: B006 used_value = float(row[header[used_resource_name_mapping[key]]]) if value >= 0: resources[key] = value * unit_conversion_mapping.get( - resource_name_mapping[key], 1) + resource_name_mapping[key], 1 + ) if used_value >= 0: used_resources[key] = used_value * unit_conversion_mapping.get( - used_resource_name_mapping[key], 1) + used_resource_name_mapping[key], 1 + ) # handle memory key = "memory" - resources[key] = \ - (float(row[header[resource_name_mapping[key]]]) - * float(row[header[resource_name_mapping["cores"]]])) \ - * unit_conversion_mapping.get(resource_name_mapping[key], 1) - used_resources[key] = \ - (float(row[header[used_resource_name_mapping[key]]]) - * float(row[header[used_resource_name_mapping["cores"]]])) \ - * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + resources[key] = ( + 
float(row[header[resource_name_mapping[key]]]) + * float(row[header[resource_name_mapping["cores"]]]) + ) * unit_conversion_mapping.get(resource_name_mapping[key], 1) + used_resources[key] = ( + float(row[header[used_resource_name_mapping[key]]]) + * float(row[header[used_resource_name_mapping["cores"]]]) + ) * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) yield Job( resources=resources, used_resources=used_resources, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), - name=row[header["Job Number"]]) + name=row[header["Job Number"]], + ) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index fb3faeb..c7e4039 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -21,6 +21,7 @@ class SimulationTimeFilter(logging.Filter): """ Dummy filter to replace log record timestamp with simulation time. """ + def filter(self, record) -> bool: record.created = time.now return True @@ -36,6 +37,7 @@ class Monitoring(object): monitoring object takes care to dispatch the object to registered statistic callables taking care to generate relevant monitoring output. """ + def __init__(self): self._statistics = {} @@ -44,9 +46,7 @@ async def run(self): for statistic in self._statistics.get(type(log_object), set()): # do the logging for record in statistic(log_object): - logging.getLogger(statistic.name).info( - statistic.name, record - ) + logging.getLogger(statistic.name).info(statistic.name, record) def register_statistic(self, statistic: Callable) -> None: """ @@ -83,6 +83,9 @@ def register_statistic(self, statistic: Callable) -> None: root_logger = logging.getLogger() for handler in root_logger.handlers: new_handler = copy.copy(handler) - new_handler.setFormatter(statistic.logging_formatter.get( - type(handler).__name__, JsonFormatter())) + new_handler.setFormatter( + statistic.logging_formatter.get( + type(handler).__name__, JsonFormatter() + ) + ) logger.addHandler(new_handler) diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py index 48b27bc..c609d30 100644 --- a/lapis/monitor/cobald.py +++ b/lapis/monitor/cobald.py @@ -15,28 +15,29 @@ def drone_statistics(drone: Drone) -> list: :param drone: the drone :return: list of records for logging """ - results = [{ - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "allocation": drone.allocation, - "utilisation": drone.utilisation, - "demand": drone.demand, - "supply": drone.supply, - "job_count": drone.jobs - }] + results = [ + { + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "allocation": drone.allocation, + "utilisation": drone.utilisation, + "demand": drone.demand, + "supply": drone.supply, + "job_count": drone.jobs, + } + ] return results drone_statistics.name = "cobald_status" -drone_statistics.whitelist = Drone, +drone_statistics.whitelist = (Drone,) drone_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + ), } @@ -47,25 +48,26 @@ def pool_statistics(pool: Pool) -> list: :param pool: the pool :return: list of records to log """ - results = [{ - "pool_configuration": "None", - "pool_type": "pool", - "pool": repr(pool), - "allocation": pool.allocation, - "utilisation": pool.utilisation, - "demand": pool.demand, - 
"supply": pool.supply, - }] + results = [ + { + "pool_configuration": "None", + "pool_type": "pool", + "pool": repr(pool), + "allocation": pool.allocation, + "utilisation": pool.utilisation, + "demand": pool.demand, + "supply": pool.supply, + } + ] return results pool_statistics.name = "cobald_status" -pool_statistics.whitelist = Pool, +pool_statistics.whitelist = (Pool,) pool_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + ), } diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 4291ce8..cb9136b 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -26,29 +26,30 @@ def resource_statistics(drone: Drone) -> list: resources = drone.theoretical_available_resources used_resources = drone.available_resources for resource_type in resources: - results.append({ - "resource_type": resource_type, - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "used_ratio": - 1 - used_resources[resource_type] - / drone.pool_resources[resource_type], - "requested_ratio": - 1 - resources[resource_type] / drone.pool_resources[resource_type] - }) + results.append( + { + "resource_type": resource_type, + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "used_ratio": 1 + - used_resources[resource_type] / drone.pool_resources[resource_type], + "requested_ratio": 1 + - resources[resource_type] / drone.pool_resources[resource_type], + } + ) return results resource_statistics.name = "resource_status" -resource_statistics.whitelist = Drone, +resource_statistics.whitelist = (Drone,) resource_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) + resolution=1, + ), } @@ -59,21 +60,18 @@ def user_demand(job_queue: JobQueue) -> list: :param scheduler: the scheduler :return: list of records for logging """ - result = [{ - "value": len(job_queue) - }] + result = [{"value": len(job_queue)}] return result user_demand.name = "user_demand" -user_demand.whitelist = JobQueue, +user_demand.whitelist = (JobQueue,) user_demand.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, - resolution=1 - ) + tags={"tardis"}, resolution=1 + ), } @@ -94,23 +92,24 @@ def job_statistics(scheduler: CondorJobScheduler) -> list: for cluster in scheduler.drone_cluster.copy(): for drone in cluster: result += drone.jobs - return [{ - "pool_configuration": "None", - "pool_type": "obs", - "pool": repr(scheduler), - "job_count": result - }] + return [ + { + "pool_configuration": "None", + "pool_type": "obs", + "pool": repr(scheduler), + "job_count": result, + } + ] job_statistics.name = "cobald_status" -job_statistics.whitelist = CondorJobScheduler, +job_statistics.whitelist = (CondorJobScheduler,) job_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", 
"pool_configuration", "pool_type", "pool"}, - resolution=1 - ) + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + ), } @@ -152,7 +151,8 @@ def job_events(job: Job) -> list: error_logged = False for resource_key in job.resources: usage = job.used_resources.get( - resource_key, job.resources.get(resource_key, None)) + resource_key, job.resources.get(resource_key, None) + ) value = usage / job.resources.get( resource_key, job.drone.pool_resources[resource_key] ) @@ -165,14 +165,13 @@ def job_events(job: Job) -> list: job_events.name = "job_event" -job_events.whitelist = Job, +job_events.whitelist = (Job,) job_events.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, - resolution=1 - ) + tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, resolution=1 + ), } @@ -187,14 +186,14 @@ def pool_status(pool: Pool) -> list: pool_status.name = "pool_status" -pool_status.whitelist = Pool, +pool_status.whitelist = (Pool,) pool_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, - resolution=1 - ) + resolution=1, + ), } @@ -213,7 +212,6 @@ def configuration_information(simulator: "Simulator") -> list: LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "resource_type"}, - resolution=1 - ) + tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 + ), } diff --git a/lapis/pool.py b/lapis/pool.py index eb4ef38..b7b8166 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -16,11 +16,15 @@ class Pool(interfaces.Pool): :param name: Name of the pool :param make_drone: Callable to create a drone with specific properties for this pool """ - def __init__(self, make_drone: Callable, - *, - capacity: int = float('inf'), - init: int = 0, - name: str = None): + + def __init__( + self, + make_drone: Callable, + *, + capacity: int = float("inf"), + init: int = 0, + name: str = None + ): super(Pool, self).__init__() assert init <= capacity self.make_drone = make_drone @@ -115,8 +119,7 @@ def demand(self, value: float): self._demand = 0 def __repr__(self): - return '<%s: %s>' % ( - self.__class__.__name__, self._name or id(self)) + return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) class StaticPool(Pool): @@ -130,10 +133,12 @@ class StaticPool(Pool): :param resources: Dictionary of resources available for each pool instantiated within the pool """ + def __init__(self, make_drone: Callable, capacity: int = 0): assert capacity > 0, "Static pool was initialised without any resources..." 
- super(StaticPool, self).__init__(capacity=capacity, init=capacity, - make_drone=make_drone) + super(StaticPool, self).__init__( + capacity=capacity, init=capacity, make_drone=make_drone + ) self._demand = capacity async def run(self): diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 0dd3e90..314eb1e 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -5,15 +5,21 @@ from ..pool import Pool -def htcondor_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 - "cores": "TotalSlotCPUs", - "disk": "TotalSlotDisk", # MiB - "memory": "TotalSlotMemory" # MiB -}, unit_conversion_mapping: dict = { # noqa: B006 - "TotalSlotCPUs": 1, - "TotalSlotDisk": 1.024 / 1024, - "TotalSlotMemory": 1.024 / 1024 -}, pool_type: Callable = Pool, make_drone: Callable = None): +def htcondor_pool_reader( + iterable, + resource_name_mapping: dict = { # noqa: B006 + "cores": "TotalSlotCPUs", + "disk": "TotalSlotDisk", # MiB + "memory": "TotalSlotMemory", # MiB + }, + unit_conversion_mapping: dict = { # noqa: B006 + "TotalSlotCPUs": 1, + "TotalSlotDisk": 1.024 / 1024, + "TotalSlotMemory": 1.024 / 1024, + }, + pool_type: Callable = Pool, + make_drone: Callable = None, +): """ Load a pool configuration that was exported via htcondor from files or iterables @@ -26,7 +32,7 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 :return: Yields the :py:class:`Pool`s found in the given iterable """ assert make_drone - reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) + reader = csv.DictReader(iterable, delimiter=" ", skipinitialspace=True) for row in reader: try: capacity = int(row["Count"]) @@ -35,7 +41,12 @@ def htcondor_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 capacity = float("Inf") yield pool_type( capacity=capacity, - make_drone=partial(make_drone, { - key: float(row[value]) * unit_conversion_mapping.get(value, 1) - for key, value in resource_name_mapping.items()}, - ignore_resources=["disk"])) + make_drone=partial( + make_drone, + { + key: float(row[value]) * unit_conversion_mapping.get(value, 1) + for key, value in resource_name_mapping.items() + }, + ignore_resources=["disk"], + ), + ) diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index d7272ac..30a2488 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -5,10 +5,15 @@ from ..pool import Pool -def machines_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 - "cores": "CPUs_per_node", - "memory": "RAM_per_node_in_KB" -}, pool_type: Callable = Pool, make_drone: Callable = None): +def machines_pool_reader( + iterable, + resource_name_mapping: dict = { # noqa: B006 + "cores": "CPUs_per_node", + "memory": "RAM_per_node_in_KB", + }, + pool_type: Callable = Pool, + make_drone: Callable = None, +): """ Load a pool configuration that was exported via htcondor from files or iterables @@ -21,12 +26,16 @@ def machines_pool_reader(iterable, resource_name_mapping: dict = { # noqa: B006 :return: Yields the :py:class:`StaticPool`s found in the given iterable """ assert make_drone - reader = csv.DictReader(iterable, delimiter=' ', skipinitialspace=True) + reader = csv.DictReader(iterable, delimiter=" ", skipinitialspace=True) for row in reader: yield pool_type( capacity=int(row["number_of_nodes"]), - make_drone=partial(make_drone, { - key: float(row[value]) for key, value in - resource_name_mapping.items()}), - name=row["cluster_name"] + make_drone=partial( + make_drone, + { + key: 
float(row[value]) + for key, value in resource_name_mapping.items() + }, + ), + name=row["cluster_name"], ) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 02f9e03..c3ad63e 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -58,16 +58,17 @@ def _add_drone(self, drone: Drone, drone_resources: Dict = None): if len(self.drone_cluster) > 0: for cluster in self.drone_cluster: current_distance = 0 - for key in {*cluster[0].pool_resources, - *drone.pool_resources}: + for key in {*cluster[0].pool_resources, *drone.pool_resources}: if drone_resources: current_distance += abs( cluster[0].theoretical_available_resources.get(key, 0) - - drone_resources.get(key, 0)) + - drone_resources.get(key, 0) + ) else: current_distance += abs( cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0)) + - drone.theoretical_available_resources.get(key, 0) + ) if current_distance < distance: minimum_distance_cluster = cluster distance = current_distance @@ -95,8 +96,8 @@ async def run(self): self.unregister_drone(best_match) left_resources = best_match.theoretical_available_resources left_resources = { - key: value - job.resources.get(key, 0) for - key, value in left_resources.items() + key: value - job.resources.get(key, 0) + for key, value in left_resources.items() } self._add_drone(best_match, left_resources) if not self._collecting and not self.job_queue: @@ -124,12 +125,14 @@ def _schedule_job(self, job) -> Drone: break else: try: - cost += 1 / (resources[resource_type] - // job.resources[resource_type]) + cost += 1 / ( + resources[resource_type] // job.resources[resource_type] + ) except KeyError: pass - for additional_resource_type in [key for key in drone.pool_resources - if key not in job.resources]: + for additional_resource_type in [ + key for key in drone.pool_resources if key not in job.resources + ]: cost += resources[additional_resource_type] cost /= len((*job.resources, *drone.pool_resources)) if cost <= 1: diff --git a/lapis/simulator.py b/lapis/simulator.py index cf615a9..9689ae2 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,8 +6,14 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from lapis.monitor.general import user_demand, job_statistics, \ - resource_statistics, pool_status, configuration_information, job_events +from lapis.monitor.general import ( + user_demand, + job_statistics, + resource_statistics, + pool_status, + configuration_information, + job_events, +) from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -44,8 +50,11 @@ def create_job_generator(self, job_input, job_reader): def create_pools(self, pool_input, pool_reader, pool_type, controller=None): assert self.job_scheduler, "Scheduler needs to be created before pools" - for pool in pool_reader(iterable=pool_input, pool_type=pool_type, - make_drone=partial(Drone, self.job_scheduler)): + for pool in pool_reader( + iterable=pool_input, + pool_type=pool_type, + make_drone=partial(Drone, self.job_scheduler), + ): self.pools.append(pool) if controller: self.controllers.append(controller(target=pool, rate=1)) @@ -71,5 +80,6 @@ async def _simulate(self, end): print(f"Finished simulation at {time.now}") async def _queue_jobs(self, job_input, job_reader): - await job_to_queue_scheduler(job_generator=job_reader(job_input), - job_queue=self.job_queue) + await job_to_queue_scheduler( + job_generator=job_reader(job_input), job_queue=self.job_queue + ) diff --git 
a/lapis_tests/__init__.py b/lapis_tests/__init__.py index fde813e..d0c54e8 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -19,6 +19,7 @@ async def test_sleep(): after = time.now assert after - before == 20 """ + @wraps(test_case) def run_test(*args, **kwargs): # pytest currently ignores __tracebackhide__ if we re-raise @@ -26,10 +27,11 @@ def run_test(*args, **kwargs): __tracebackhide__ = True # >>> This is not the frame you are looking for. Do read on. <<< return run(test_case(*args, **kwargs)) + return run_test -class DummyScheduler(): +class DummyScheduler: @staticmethod def register_drone(drone: Drone): pass diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index c6ed66b..3e110f5 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -5,15 +5,17 @@ class TestHtcondorJobReader(object): def test_simple_read(self): - with open(os.path.join(os.path.dirname(__file__), "..", "data", - "htcondor_jobs.csv")) as input_file: + with open( + os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv") + ) as input_file: jobs = 0 for job in htcondor_job_reader(input_file): assert job is not None jobs += 1 assert jobs > 0 - with open(os.path.join(os.path.dirname(__file__), "..", "data", - "htcondor_jobs.csv")) as input_file: + with open( + os.path.join(os.path.dirname(__file__), "..", "data", "htcondor_jobs.csv") + ) as input_file: # ensure that one job was removed by importer (wrong walltime given) lines = sum(1 for _ in input_file) assert jobs == (lines - 2) diff --git a/lapis_tests/job_io/test_swf.py b/lapis_tests/job_io/test_swf.py index 04bca06..ad811ba 100644 --- a/lapis_tests/job_io/test_swf.py +++ b/lapis_tests/job_io/test_swf.py @@ -4,8 +4,9 @@ class TestSwfJobReader(object): def test_simple_read(self): - with open(os.path.join(os.path.dirname(__file__), "..", "data", - "swf_jobs.swf")) as input_file: + with open( + os.path.join(os.path.dirname(__file__), "..", "data", "swf_jobs.swf") + ) as input_file: job_count = 0 for job in swf_job_reader(input_file): assert job is not None diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index dcf1828..13b4090 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -40,11 +40,13 @@ async def test_job_in_drone(self): scheduler = DummyScheduler() job = Job( resources={"walltime": 50, "cores": 1, "memory": 1}, - used_resources={"walltime": 10, "cores": 1, "memory": 1}) + used_resources={"walltime": 10, "cores": 1, "memory": 1}, + ) drone = Drone( scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, - scheduling_duration=0) + scheduling_duration=0, + ) async with Scope() as scope: scope.do(drone.start_job(job=job)) assert 10 == time @@ -56,11 +58,13 @@ async def test_nonmatching_job_in_drone(self): scheduler = DummyScheduler() job = Job( resources={"walltime": 50, "cores": 2, "memory": 1}, - used_resources={"walltime": 10, "cores": 1, "memory": 1}) + used_resources={"walltime": 10, "cores": 1, "memory": 1}, + ) drone = Drone( scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, - scheduling_duration=0) + scheduling_duration=0, + ) async with Scope() as scope: scope.do(drone.start_job(job=job)) assert 0 == time @@ -72,14 +76,17 @@ async def test_two_nonmatching_jobs(self): scheduler = DummyScheduler() job_one = Job( resources={"walltime": 50, "cores": 1, "memory": 1}, - used_resources={"walltime": 10, "cores": 1, "memory": 1}) + used_resources={"walltime": 10, "cores": 1, "memory": 1}, + ) job_two = 
Job( resources={"walltime": 50, "cores": 1, "memory": 1}, - used_resources={"walltime": 10, "cores": 1, "memory": 1}) + used_resources={"walltime": 10, "cores": 1, "memory": 1}, + ) drone = Drone( scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, - scheduling_duration=0) + scheduling_duration=0, + ) async with Scope() as scope: scope.do(drone.start_job(job=job_one)) scope.do(drone.start_job(job=job_two)) @@ -94,14 +101,17 @@ async def test_two_matching_jobs(self): scheduler = DummyScheduler() job_one = Job( resources={"walltime": 50, "cores": 1, "memory": 1}, - used_resources={"walltime": 10, "cores": 1, "memory": 1}) + used_resources={"walltime": 10, "cores": 1, "memory": 1}, + ) job_two = Job( resources={"walltime": 50, "cores": 1, "memory": 1}, - used_resources={"walltime": 10, "cores": 1, "memory": 1}) + used_resources={"walltime": 10, "cores": 1, "memory": 1}, + ) drone = Drone( scheduler=scheduler, pool_resources={"cores": 2, "memory": 2}, - scheduling_duration=0) + scheduling_duration=0, + ) async with Scope() as scope: scope.do(drone.start_job(job=job_one)) scope.do(drone.start_job(job=job_two)) diff --git a/lapis_tests/utility/__init__.py b/lapis_tests/utility/__init__.py index a90cf2b..0decb18 100644 --- a/lapis_tests/utility/__init__.py +++ b/lapis_tests/utility/__init__.py @@ -20,10 +20,10 @@ def clear(self): _index_lock = threading.Lock() -def make_test_logger(base_name: str = 'test_logger'): +def make_test_logger(base_name: str = "test_logger"): with _index_lock: global _test_index - log_name = base_name + '.test%d' % _test_index + log_name = base_name + ".test%d" % _test_index _test_index += 1 logger = logging.getLogger(log_name) logger.propagate = False diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 0e2cf9e..533e9eb 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -14,19 +14,19 @@ def parse_line_protocol(literal: str): - name_tags, _, fields_stamp = literal.strip().partition(' ') - fields, _, stamp = fields_stamp.partition(' ') - fields = fields.split(',') if fields else [] - name, *tags = name_tags.split(',') - return name, { - key: value - for key, value - in (tag.split('=') for tag in tags) - }, { - key: ast.literal_eval(value) - for key, value - in (field.split('=') for field in fields) - }, None if not stamp else int(stamp) + name_tags, _, fields_stamp = literal.strip().partition(" ") + fields, _, stamp = fields_stamp.partition(" ") + fields = fields.split(",") if fields else [] + name, *tags = name_tags.split(",") + return ( + name, + {key: value for key, value in (tag.split("=") for tag in tags)}, + { + key: ast.literal_eval(value) + for key, value in (field.split("=") for field in fields) + }, + None if not stamp else int(stamp), + ) class TestSimulationTimeFilter(object): @@ -49,6 +49,7 @@ async def test_simple(self): async def test_explicit(self): def record(): pass + record.created = pytime() filter = SimulationTimeFilter() async with Scope() as _: @@ -79,6 +80,6 @@ def test_registration_failure(self): statistics.logging_formatter = {} monitoring.register_statistic(statistics) assert all(statistics not in stat for stat in monitoring._statistics.values()) - statistics.whitelist = str, + statistics.whitelist = (str,) monitoring.register_statistic(statistics) assert all(statistics in stat for stat in monitoring._statistics.values()) From 514bf57ecad6e4f47cfa467e89a3b85c205b00ae Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 22:34:00 +0100 
Subject: [PATCH 253/648] bumped version to 0.3.0 --- lapis/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/__init__.py b/lapis/__init__.py index 8e1bb65..ebe7177 100644 --- a/lapis/__init__.py +++ b/lapis/__init__.py @@ -1,3 +1,3 @@ """Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator""" -__version__ = "0.2.0" +__version__ = "0.3.0" From 85c5580e41cbc7206fadeae7a7dec4ec2027a5a0 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 22:37:47 +0100 Subject: [PATCH 254/648] corrected paths for black --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f92b60a..457dcbf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ script: - python -m flake8 - | if [[ $TRAVIS_PYTHON_VERSION != 'pypy3'* ]]; then - python -m black --target-version py36 --check src/ cobald_tests/ + python -m black --target-version py36 --check lapis/ lapis_tests/ fi - python -m coverage run -m pytest after_success: From f1eaff0d1ade9f2bc88c1c7e0a1af3af311ea18b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sun, 27 Oct 2019 23:00:57 +0100 Subject: [PATCH 255/648] added missing formatting --- .pre-commit-config.yaml | 2 ++ lapis/pool.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f10048..31e2e6e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,3 +9,5 @@ repos: rev: 19.3b0 hooks: - id: black + args: + - --py36 diff --git a/lapis/pool.py b/lapis/pool.py index b7b8166..928b710 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -23,7 +23,7 @@ def __init__( *, capacity: int = float("inf"), init: int = 0, - name: str = None + name: str = None, ): super(Pool, self).__init__() assert init <= capacity From 0f37f289f43109fbfac1f83951aade4af9a4b803 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 28 Oct 2019 10:40:02 +0100 Subject: [PATCH 256/648] Updated to newest version and resolved merge conflicts --- lapis/drone.py | 2 ++ lapis/job.py | 7 +++++++ lapis/scheduler.py | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index b6d9cab..a5ac53f 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -126,6 +126,7 @@ async def start_job(self, job: Job, kill: bool = False): async with self.resources.claim( **job.resources ), self.used_resources.claim(**job.used_resources): + print('awaiting completion of job {} on drone {}'.format(job.__repr__(), self.__repr__())) await sampling_required.put(self) if kill: for resource_key in job.resources: @@ -140,6 +141,7 @@ async def start_job(self, job: Job, kill: bool = False): pass self.scheduler.update_drone(self) await job_execution.done + print('job {} done'.format(job.__repr__())) except ResourcesUnavailable: await instant job_execution.cancel() diff --git a/lapis/job.py b/lapis/job.py index 3e722c2..801805f 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -90,18 +90,25 @@ def waiting_time(self) -> float: async def run(self): self.in_queue_until = time.now self._success = None + print('starting job {} @ time {}'.format(self.__repr__(), self.in_queue_until)) + await sampling_required.put(self) try: + print('awaiting completion of job {} @ time {}'.format(self.__repr__(), time+self.walltime)) await (time + self.walltime) except CancelTask: + print('failed job {}'.format(self.__repr__())) self._success = False except BaseException: + print('failed job {}'.format(self.__repr__())) self._success = False raise 
else: self._success = True await sampling_required.put(self) + print('finish job {} @ time {}, successful {}'.format(self.__repr__(), time.now, self._success)) + def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index c3ad63e..33c64f7 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -41,6 +41,8 @@ def drone_list(self): def register_drone(self, drone: Drone): self._add_drone(drone) + print('registered drone {} with resources {}'.format(drone.__repr__(), drone.available_resources)) + def unregister_drone(self, drone: Drone): for cluster in self.drone_cluster: @@ -82,14 +84,20 @@ def _add_drone(self, drone: Drone, drone_resources: Dict = None): def update_drone(self, drone: Drone): self.unregister_drone(drone) self._add_drone(drone) + print('updated drone {} to resources {}'.format(drone.__repr__(), drone.available_resources)) async def run(self): async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): + print('new interval') + print(self.job_queue) for job in self.job_queue: + print('scheduling job {}'.format(job.__repr__())) best_match = self._schedule_job(job) + print('best match for job {}: {}'.format(job.__repr__(), best_match.__repr__())) if best_match: + print('starting job {} in {}'.format(job.__repr__(), best_match.__repr__())) scope.do(best_match.start_job(job)) self.job_queue.remove(job) await sampling_required.put(self.job_queue) @@ -100,6 +108,7 @@ async def run(self): for key, value in left_resources.items() } self._add_drone(best_match, left_resources) + if not self._collecting and not self.job_queue: break await sampling_required.put(self) From b30e641f418eb42fbebf132f5bac93384d4395fe Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 29 Oct 2019 22:29:37 +0100 Subject: [PATCH 257/648] corrected calculation and access to conversion mapping --- lapis/job_io/htcondor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 09bd7e1..52e3a40 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -49,11 +49,10 @@ def htcondor_job_reader( pass used_resources = { "cores": ( - float(row["RemoteSysCpu"]) - + float(row["RemoteUserCpu"]) + (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) / float(row[used_resource_name_mapping["walltime"]]) ) - * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + * unit_conversion_mapping.get(used_resource_name_mapping["cores"], 1) } for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] From a1b8f1665cfbdc7e3fd01159aa87c9817934f64c Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Wed, 30 Oct 2019 09:14:51 +0100 Subject: [PATCH 258/648] Skip black for pypy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4ead1c6..f33c643 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ test = [ "pytest >=4.3.0", "flake8", "flake8-bugbear", - "black", + "black; implementation_name=='pypy'", ] doc = ["sphinx", "sphinx_rtd_theme"] dev = ["pre-commit"] From 0ad29b2f24cdccb0827baa861e1aa27cf593b376 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 09:42:11 +0100 Subject: [PATCH 259/648] extended Job class attributes by inputfiles and cpu_efficiency --- lapis/job.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/lapis/job.py 
b/lapis/job.py index 801805f..361d577 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -16,7 +16,9 @@ class Job(object): "used_resources", "walltime", "requested_walltime", + "cpu_efficiency", "queue_date", + "inputfiles", "in_queue_since", "in_queue_until", "_name", @@ -28,8 +30,10 @@ def __init__( self, resources: dict, used_resources: dict, + cpu_efficiency: Optional[float] = None, in_queue_since: float = 0, queue_date: float = 0, + inputfiles: dict = dict(), name: str = None, drone: "Drone" = None, ): @@ -43,6 +47,7 @@ def __init__( :param used_resources: Resource usage of the job :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler + :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job :param drone: Drone where the job is running on @@ -57,9 +62,11 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] + self.cpu_efficiency = cpu_efficiency self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) self.queue_date = queue_date + self.inputfiles = inputfiles assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since self.in_queue_until = None @@ -87,28 +94,25 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") + @property + def has_inputfiles(self) -> bool: + return bool(self.inputfiles) + async def run(self): self.in_queue_until = time.now self._success = None - print('starting job {} @ time {}'.format(self.__repr__(), self.in_queue_until)) - await sampling_required.put(self) try: - print('awaiting completion of job {} @ time {}'.format(self.__repr__(), time+self.walltime)) await (time + self.walltime) except CancelTask: - print('failed job {}'.format(self.__repr__())) self._success = False except BaseException: - print('failed job {}'.format(self.__repr__())) self._success = False raise else: self._success = True await sampling_required.put(self) - print('finish job {} @ time {}, successful {}'.format(self.__repr__(), time.now, self._success)) - def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) From 589b81488ff338ec760a7633eddd56d668f4e1ad Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 09:44:01 +0100 Subject: [PATCH 260/648] added json readout to htcondor job reader --- lapis/job_io/htcondor.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 09bd7e1..cbb5746 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -1,4 +1,5 @@ import csv +import json import logging from lapis.job import Job @@ -31,37 +32,51 @@ def htcondor_job_reader( "DiskUsage_RAW": 1.024 / 1024 / 1024, }, ): - htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") + input_file_type = iterable.name.split(".")[-1] + if input_file_type == "json": + htcondor_reader = json.load(iterable) + elif input_file_type == "csv": + htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") + else: + print("Invalid input file type {}. 
Job input file can not be read.".format(input_file_type)) - for row in htcondor_reader: - if float(row[used_resource_name_mapping["walltime"]]) <= 0: + for entry in htcondor_reader: + if float(entry[used_resource_name_mapping["walltime"]]) <= 0: logging.getLogger("implementation").warning( - "removed job from htcondor import (%s)", row + "removed job from htcondor import (%s)", entry ) continue resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(row[original_key]) * unit_conversion_mapping.get( + resources[key] = float(entry[original_key]) * unit_conversion_mapping.get( original_key, 1 ) except ValueError: pass used_resources = { - "cores": ( - float(row["RemoteSysCpu"]) - + float(row["RemoteUserCpu"]) - / float(row[used_resource_name_mapping["walltime"]]) - ) + "cores": (( + float(entry["RemoteSysCpu"]) + + float(entry["RemoteUserCpu"])) + / float(entry[used_resource_name_mapping["walltime"]]) + * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) } + try: + inputfiles = {file["filename"]: file["usedsize"] for file in entry["Inputfiles"]} + except KeyError: + inputfiles = dict() for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] used_resources[key] = float( - row[original_key] + entry[original_key] ) * unit_conversion_mapping.get(original_key, 1) + cpu_efficiency = (float(entry["RemoteSysCpu"]) + float(entry["RemoteUserCpu"])) \ + / float(entry[used_resource_name_mapping["walltime"]]) yield Job( resources=resources, used_resources=used_resources, - queue_date=float(row[used_resource_name_mapping["queuetime"]]), + queue_date=float(entry[used_resource_name_mapping["queuetime"]]), + cpu_efficiency=cpu_efficiency, + inputfiles=inputfiles ) From ef7b2ea3dcb63b2cb9e84b4dc4a9d5b145932332 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 28 Oct 2019 10:40:02 +0100 Subject: [PATCH 261/648] Updated to newest version and resolved merge conflicts --- lapis/drone.py | 2 ++ lapis/job.py | 7 +++++++ lapis/scheduler.py | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index b6d9cab..a5ac53f 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -126,6 +126,7 @@ async def start_job(self, job: Job, kill: bool = False): async with self.resources.claim( **job.resources ), self.used_resources.claim(**job.used_resources): + print('awaiting completion of job {} on drone {}'.format(job.__repr__(), self.__repr__())) await sampling_required.put(self) if kill: for resource_key in job.resources: @@ -140,6 +141,7 @@ async def start_job(self, job: Job, kill: bool = False): pass self.scheduler.update_drone(self) await job_execution.done + print('job {} done'.format(job.__repr__())) except ResourcesUnavailable: await instant job_execution.cancel() diff --git a/lapis/job.py b/lapis/job.py index 3e722c2..801805f 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -90,18 +90,25 @@ def waiting_time(self) -> float: async def run(self): self.in_queue_until = time.now self._success = None + print('starting job {} @ time {}'.format(self.__repr__(), self.in_queue_until)) + await sampling_required.put(self) try: + print('awaiting completion of job {} @ time {}'.format(self.__repr__(), time+self.walltime)) await (time + self.walltime) except CancelTask: + print('failed job {}'.format(self.__repr__())) self._success = False except BaseException: + print('failed job {}'.format(self.__repr__())) self._success = False raise else: self._success = True await sampling_required.put(self) + 
print('finish job {} @ time {}, successful {}'.format(self.__repr__(), time.now, self._success)) + def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index c3ad63e..33c64f7 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -41,6 +41,8 @@ def drone_list(self): def register_drone(self, drone: Drone): self._add_drone(drone) + print('registered drone {} with resources {}'.format(drone.__repr__(), drone.available_resources)) + def unregister_drone(self, drone: Drone): for cluster in self.drone_cluster: @@ -82,14 +84,20 @@ def _add_drone(self, drone: Drone, drone_resources: Dict = None): def update_drone(self, drone: Drone): self.unregister_drone(drone) self._add_drone(drone) + print('updated drone {} to resources {}'.format(drone.__repr__(), drone.available_resources)) async def run(self): async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): + print('new interval') + print(self.job_queue) for job in self.job_queue: + print('scheduling job {}'.format(job.__repr__())) best_match = self._schedule_job(job) + print('best match for job {}: {}'.format(job.__repr__(), best_match.__repr__())) if best_match: + print('starting job {} in {}'.format(job.__repr__(), best_match.__repr__())) scope.do(best_match.start_job(job)) self.job_queue.remove(job) await sampling_required.put(self.job_queue) @@ -100,6 +108,7 @@ async def run(self): for key, value in left_resources.items() } self._add_drone(best_match, left_resources) + if not self._collecting and not self.job_queue: break await sampling_required.put(self) From e7e674d914a0ea47d7b4bb8dcba36e6c56e38a2e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 09:42:11 +0100 Subject: [PATCH 262/648] extended Job class attributes by inputfiles and cpu_efficiency --- lapis/job.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 801805f..361d577 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -16,7 +16,9 @@ class Job(object): "used_resources", "walltime", "requested_walltime", + "cpu_efficiency", "queue_date", + "inputfiles", "in_queue_since", "in_queue_until", "_name", @@ -28,8 +30,10 @@ def __init__( self, resources: dict, used_resources: dict, + cpu_efficiency: Optional[float] = None, in_queue_since: float = 0, queue_date: float = 0, + inputfiles: dict = dict(), name: str = None, drone: "Drone" = None, ): @@ -43,6 +47,7 @@ def __init__( :param used_resources: Resource usage of the job :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler + :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job :param drone: Drone where the job is running on @@ -57,9 +62,11 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] + self.cpu_efficiency = cpu_efficiency self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) self.queue_date = queue_date + self.inputfiles = inputfiles assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since self.in_queue_until = None @@ -87,28 +94,25 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") + @property + def has_inputfiles(self) -> bool: + return bool(self.inputfiles) + async def run(self): self.in_queue_until = time.now self._success = None - 
print('starting job {} @ time {}'.format(self.__repr__(), self.in_queue_until)) - await sampling_required.put(self) try: - print('awaiting completion of job {} @ time {}'.format(self.__repr__(), time+self.walltime)) await (time + self.walltime) except CancelTask: - print('failed job {}'.format(self.__repr__())) self._success = False except BaseException: - print('failed job {}'.format(self.__repr__())) self._success = False raise else: self._success = True await sampling_required.put(self) - print('finish job {} @ time {}, successful {}'.format(self.__repr__(), time.now, self._success)) - def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) From b5b9c0dea48038057053260eda37874eea5a55ce Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 09:44:01 +0100 Subject: [PATCH 263/648] added json readout to htcondor job reader --- lapis/job_io/htcondor.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 52e3a40..31e24a1 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -1,4 +1,5 @@ import csv +import json import logging from lapis.job import Job @@ -31,36 +32,50 @@ def htcondor_job_reader( "DiskUsage_RAW": 1.024 / 1024 / 1024, }, ): - htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") + input_file_type = iterable.name.split(".")[-1] + if input_file_type == "json": + htcondor_reader = json.load(iterable) + elif input_file_type == "csv": + htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") + else: + print("Invalid input file type {}. Job input file can not be read.".format(input_file_type)) - for row in htcondor_reader: - if float(row[used_resource_name_mapping["walltime"]]) <= 0: + for entry in htcondor_reader: + if float(entry[used_resource_name_mapping["walltime"]]) <= 0: logging.getLogger("implementation").warning( - "removed job from htcondor import (%s)", row + "removed job from htcondor import (%s)", entry ) continue resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(row[original_key]) * unit_conversion_mapping.get( + resources[key] = float(entry[original_key]) * unit_conversion_mapping.get( original_key, 1 ) except ValueError: pass used_resources = { "cores": ( - (float(row["RemoteSysCpu"]) + float(row["RemoteUserCpu"])) - / float(row[used_resource_name_mapping["walltime"]]) + (float(entry["RemoteSysCpu"]) + float(entry["RemoteUserCpu"])) + / float(entry[used_resource_name_mapping["walltime"]]) ) * unit_conversion_mapping.get(used_resource_name_mapping["cores"], 1) } + try: + inputfiles = {file["filename"]: file["usedsize"] for file in entry["Inputfiles"]} + except KeyError: + inputfiles = dict() for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] used_resources[key] = float( - row[original_key] + entry[original_key] ) * unit_conversion_mapping.get(original_key, 1) + cpu_efficiency = (float(entry["RemoteSysCpu"]) + float(entry["RemoteUserCpu"])) \ + / float(entry[used_resource_name_mapping["walltime"]]) yield Job( resources=resources, used_resources=used_resources, - queue_date=float(row[used_resource_name_mapping["queuetime"]]), + queue_date=float(entry[used_resource_name_mapping["queuetime"]]), + cpu_efficiency=cpu_efficiency, + inputfiles=inputfiles ) From ab12f64ba928cc339480df5568c1506b44ffc40e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 10:04:44 +0100 
Subject: [PATCH 264/648] removed debug outputs --- lapis/drone.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index a5ac53f..b6d9cab 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -126,7 +126,6 @@ async def start_job(self, job: Job, kill: bool = False): async with self.resources.claim( **job.resources ), self.used_resources.claim(**job.used_resources): - print('awaiting completion of job {} on drone {}'.format(job.__repr__(), self.__repr__())) await sampling_required.put(self) if kill: for resource_key in job.resources: @@ -141,7 +140,6 @@ async def start_job(self, job: Job, kill: bool = False): pass self.scheduler.update_drone(self) await job_execution.done - print('job {} done'.format(job.__repr__())) except ResourcesUnavailable: await instant job_execution.cancel() From 42871469fd0e23bd2afd4d67b7a10366d00cfa3c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 10:04:44 +0100 Subject: [PATCH 265/648] removed accidently pushed debug outputs --- lapis/drone.py | 2 -- lapis/scheduler.py | 8 -------- 2 files changed, 10 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index a5ac53f..b6d9cab 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -126,7 +126,6 @@ async def start_job(self, job: Job, kill: bool = False): async with self.resources.claim( **job.resources ), self.used_resources.claim(**job.used_resources): - print('awaiting completion of job {} on drone {}'.format(job.__repr__(), self.__repr__())) await sampling_required.put(self) if kill: for resource_key in job.resources: @@ -141,7 +140,6 @@ async def start_job(self, job: Job, kill: bool = False): pass self.scheduler.update_drone(self) await job_execution.done - print('job {} done'.format(job.__repr__())) except ResourcesUnavailable: await instant job_execution.cancel() diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 33c64f7..8d41111 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -41,8 +41,6 @@ def drone_list(self): def register_drone(self, drone: Drone): self._add_drone(drone) - print('registered drone {} with resources {}'.format(drone.__repr__(), drone.available_resources)) - def unregister_drone(self, drone: Drone): for cluster in self.drone_cluster: @@ -84,20 +82,14 @@ def _add_drone(self, drone: Drone, drone_resources: Dict = None): def update_drone(self, drone: Drone): self.unregister_drone(drone) self._add_drone(drone) - print('updated drone {} to resources {}'.format(drone.__repr__(), drone.available_resources)) async def run(self): async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): - print('new interval') - print(self.job_queue) for job in self.job_queue: - print('scheduling job {}'.format(job.__repr__())) best_match = self._schedule_job(job) - print('best match for job {}: {}'.format(job.__repr__(), best_match.__repr__())) if best_match: - print('starting job {} in {}'.format(job.__repr__(), best_match.__repr__())) scope.do(best_match.start_job(job)) self.job_queue.remove(job) await sampling_required.put(self.job_queue) From f105601efdb8e39182bc30c91825c171df7d13c1 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Wed, 30 Oct 2019 10:30:26 +0100 Subject: [PATCH 266/648] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f33c643..9b22d0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ test = [ "pytest >=4.3.0", "flake8", "flake8-bugbear", - "black; 
implementation_name=='pypy'", + "black; implementation_name=='cpython'", ] doc = ["sphinx", "sphinx_rtd_theme"] dev = ["pre-commit"] From aa5017feb1c46fde4e7b70a63db992f1f0a7f6ec Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 10:31:32 +0100 Subject: [PATCH 267/648] resolved PEP 8 issues --- lapis/job_io/htcondor.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 31e24a1..cd32ab6 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -38,7 +38,8 @@ def htcondor_job_reader( elif input_file_type == "csv": htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") else: - print("Invalid input file type {}. Job input file can not be read.".format(input_file_type)) + print("Invalid input file type {}. Job input file can not be read.".format( + input_file_type)) for entry in htcondor_reader: if float(entry[used_resource_name_mapping["walltime"]]) <= 0: @@ -49,9 +50,8 @@ def htcondor_job_reader( resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(entry[original_key]) * unit_conversion_mapping.get( - original_key, 1 - ) + resources[key] = float(entry[original_key]) \ + * unit_conversion_mapping.get(original_key, 1) except ValueError: pass used_resources = { @@ -62,7 +62,8 @@ def htcondor_job_reader( * unit_conversion_mapping.get(used_resource_name_mapping["cores"], 1) } try: - inputfiles = {file["filename"]: file["usedsize"] for file in entry["Inputfiles"]} + inputfiles = {file["filename"]: file["usedsize"] + for file in entry["Inputfiles"]} except KeyError: inputfiles = dict() for key in ["memory", "walltime", "disk"]: @@ -70,7 +71,8 @@ def htcondor_job_reader( used_resources[key] = float( entry[original_key] ) * unit_conversion_mapping.get(original_key, 1) - cpu_efficiency = (float(entry["RemoteSysCpu"]) + float(entry["RemoteUserCpu"])) \ + cpu_efficiency = (float(entry["RemoteSysCpu"]) + + float(entry["RemoteUserCpu"])) \ / float(entry[used_resource_name_mapping["walltime"]]) yield Job( resources=resources, From 68f2451774d87d77e13c83fadd19bf9e489e69de Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 11:02:18 +0100 Subject: [PATCH 268/648] resolved further PEP 8 issues --- lapis/job_io/htcondor.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index cd32ab6..1ad5280 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -50,8 +50,8 @@ def htcondor_job_reader( resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(entry[original_key]) \ - * unit_conversion_mapping.get(original_key, 1) + resources[key] = float(entry[original_key]) * \ + unit_conversion_mapping.get(original_key, 1) except ValueError: pass used_resources = { @@ -68,12 +68,11 @@ def htcondor_job_reader( inputfiles = dict() for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] - used_resources[key] = float( - entry[original_key] - ) * unit_conversion_mapping.get(original_key, 1) - cpu_efficiency = (float(entry["RemoteSysCpu"]) + - float(entry["RemoteUserCpu"])) \ - / float(entry[used_resource_name_mapping["walltime"]]) + used_resources[key] = float(entry[original_key]) \ + * unit_conversion_mapping.get(original_key, 1) + cpu_efficiency = (float(entry["RemoteSysCpu"]) + + float(entry["RemoteUserCpu"])) \ + / 
float(entry[used_resource_name_mapping["walltime"]]) yield Job( resources=resources, used_resources=used_resources, From 66b45681ec6c7b72d17eb6e29d27ed020c709de1 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 11:40:33 +0100 Subject: [PATCH 269/648] removed duplicate Job attribute cpu_efficiency and dispensable method has_inputfiles --- lapis/job.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 361d577..d1a048c 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -16,7 +16,6 @@ class Job(object): "used_resources", "walltime", "requested_walltime", - "cpu_efficiency", "queue_date", "inputfiles", "in_queue_since", @@ -30,10 +29,9 @@ def __init__( self, resources: dict, used_resources: dict, - cpu_efficiency: Optional[float] = None, in_queue_since: float = 0, queue_date: float = 0, - inputfiles: dict = dict(), + inputfiles: Optional[dict] = None, name: str = None, drone: "Drone" = None, ): @@ -47,7 +45,6 @@ def __init__( :param used_resources: Resource usage of the job :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler - :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job :param drone: Drone where the job is running on @@ -62,7 +59,6 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] - self.cpu_efficiency = cpu_efficiency self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) self.queue_date = queue_date @@ -94,10 +90,6 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - @property - def has_inputfiles(self) -> bool: - return bool(self.inputfiles) - async def run(self): self.in_queue_until = time.now self._success = None From a0eb0614601433568e960ac3e5061afed82fbc8b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 11:41:13 +0100 Subject: [PATCH 270/648] removed new line --- lapis/scheduler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 8d41111..c3ad63e 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -100,7 +100,6 @@ async def run(self): for key, value in left_resources.items() } self._add_drone(best_match, left_resources) - if not self._collecting and not self.job_queue: break await sampling_required.put(self) From 6d27fa1abd81dd9a50963aed66315f7bb6111417 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 11:44:03 +0100 Subject: [PATCH 271/648] removed assignment of duplicate Job class attribute cpu_efficiency --- lapis/job_io/htcondor.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 1ad5280..0c07e76 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -32,14 +32,13 @@ def htcondor_job_reader( "DiskUsage_RAW": 1.024 / 1024 / 1024, }, ): - input_file_type = iterable.name.split(".")[-1] - if input_file_type == "json": + try: htcondor_reader = json.load(iterable) - elif input_file_type == "csv": + except ValueError: htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") else: - print("Invalid input file type {}. Job input file can not be read.".format( - input_file_type)) + logging.getLogger("implementation").error( + "Invalid input file %s. Job input file can not be read." 
% iterable.name) for entry in htcondor_reader: if float(entry[used_resource_name_mapping["walltime"]]) <= 0: @@ -65,18 +64,15 @@ def htcondor_job_reader( inputfiles = {file["filename"]: file["usedsize"] for file in entry["Inputfiles"]} except KeyError: - inputfiles = dict() + inputfiles = None for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] used_resources[key] = float(entry[original_key]) \ * unit_conversion_mapping.get(original_key, 1) - cpu_efficiency = (float(entry["RemoteSysCpu"]) - + float(entry["RemoteUserCpu"])) \ - / float(entry[used_resource_name_mapping["walltime"]]) + yield Job( resources=resources, used_resources=used_resources, queue_date=float(entry[used_resource_name_mapping["queuetime"]]), - cpu_efficiency=cpu_efficiency, inputfiles=inputfiles ) From faab95e62a12b7096a9854e49dcfa22523469355 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 13:00:22 +0100 Subject: [PATCH 272/648] resolved further PEP 8 issues --- lapis/job_io/htcondor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 0c07e76..e5c2804 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -49,8 +49,8 @@ def htcondor_job_reader( resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(entry[original_key]) * \ - unit_conversion_mapping.get(original_key, 1) + resources[key] = float(entry[original_key]) \ + * unit_conversion_mapping.get(original_key, 1) except ValueError: pass used_resources = { From b5ee6faafb21dc5502776d76fdf07ef76dc2c072 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 14:09:33 +0100 Subject: [PATCH 273/648] changed passing of input file information to Job class to using resource dictionary --- lapis/job.py | 3 +-- lapis/job_io/htcondor.py | 30 +++++++++++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index d1a048c..9354288 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -31,7 +31,6 @@ def __init__( used_resources: dict, in_queue_since: float = 0, queue_date: float = 0, - inputfiles: Optional[dict] = None, name: str = None, drone: "Drone" = None, ): @@ -61,8 +60,8 @@ def __init__( self.resources[key] = self.used_resources[key] self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) + self.inputfiles = resources.pop("inputfiles", None) self.queue_date = queue_date - self.inputfiles = inputfiles assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since self.in_queue_until = None diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index e5c2804..7d3d5c6 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -38,7 +38,8 @@ def htcondor_job_reader( htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") else: logging.getLogger("implementation").error( - "Invalid input file %s. Job input file can not be read." % iterable.name) + "Invalid input file %s. Job input file can not be read." 
% iterable.name + ) for entry in htcondor_reader: if float(entry[used_resource_name_mapping["walltime"]]) <= 0: @@ -49,10 +50,12 @@ def htcondor_job_reader( resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float(entry[original_key]) \ - * unit_conversion_mapping.get(original_key, 1) + resources[key] = float( + entry[original_key] + ) * unit_conversion_mapping.get(original_key, 1) except ValueError: pass + used_resources = { "cores": ( (float(entry["RemoteSysCpu"]) + float(entry["RemoteUserCpu"])) @@ -60,19 +63,24 @@ def htcondor_job_reader( ) * unit_conversion_mapping.get(used_resource_name_mapping["cores"], 1) } - try: - inputfiles = {file["filename"]: file["usedsize"] - for file in entry["Inputfiles"]} - except KeyError: - inputfiles = None for key in ["memory", "walltime", "disk"]: original_key = used_resource_name_mapping[key] - used_resources[key] = float(entry[original_key]) \ - * unit_conversion_mapping.get(original_key, 1) + used_resources[key] = float( + entry[original_key] + ) * unit_conversion_mapping.get(original_key, 1) + try: + resources["inputfiles"] = { + file["filename"]: { + "filesize": file["filesize"], + "usedsize": file["usedsize"], + } + for file in entry["Inputfiles"] + } + except KeyError: + pass yield Job( resources=resources, used_resources=used_resources, queue_date=float(entry[used_resource_name_mapping["queuetime"]]), - inputfiles=inputfiles ) From 2e2c1d4b8f1f2695d0440aec67e836bcedd2ea06 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Oct 2019 16:08:29 +0100 Subject: [PATCH 274/648] hotfix of csv/json job file readout --- lapis/job_io/htcondor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 7d3d5c6..bbc0113 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -32,15 +32,15 @@ def htcondor_job_reader( "DiskUsage_RAW": 1.024 / 1024 / 1024, }, ): - try: + input_file_type = iterable.name.split(".")[-1] + if input_file_type == "json": htcondor_reader = json.load(iterable) - except ValueError: + elif input_file_type == "csv": htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") else: logging.getLogger("implementation").error( "Invalid input file %s. Job input file can not be read." 
% iterable.name ) - for entry in htcondor_reader: if float(entry[used_resource_name_mapping["walltime"]]) <= 0: logging.getLogger("implementation").warning( From dbe35c6cf7fe6303f224001b3521324e9db48990 Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Thu, 31 Oct 2019 10:30:04 +0100 Subject: [PATCH 275/648] Update lapis/job_io/htcondor.py Co-Authored-By: Eileen Kuehn --- lapis/job_io/htcondor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index bbc0113..6b53e46 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -32,7 +32,7 @@ def htcondor_job_reader( "DiskUsage_RAW": 1.024 / 1024 / 1024, }, ): - input_file_type = iterable.name.split(".")[-1] + input_file_type = iterable.name.split(".")[-1].lower() if input_file_type == "json": htcondor_reader = json.load(iterable) elif input_file_type == "csv": From e430d4e92022ca3c75a7cd697bf1066c939a120a Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 15:46:22 +0100 Subject: [PATCH 276/648] improved structure of input file dictionary --- lapis/job_io/htcondor.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index bbc0113..c97a102 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -70,13 +70,7 @@ def htcondor_job_reader( ) * unit_conversion_mapping.get(original_key, 1) try: - resources["inputfiles"] = { - file["filename"]: { - "filesize": file["filesize"], - "usedsize": file["usedsize"], - } - for file in entry["Inputfiles"] - } + resources["inputfiles"] = entry["Inputfiles"] except KeyError: pass yield Job( From 0889480f64e5d0da49b9b07bd2b9f424b7d0d1b9 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 1 Nov 2019 13:14:07 +0100 Subject: [PATCH 277/648] split job inputfile information into requested_inputfiles/used_inputfiles --- lapis/job.py | 6 ++++-- lapis/job_io/htcondor.py | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 9354288..934ceee 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -17,7 +17,8 @@ class Job(object): "walltime", "requested_walltime", "queue_date", - "inputfiles", + "requested_inputfiles", + "used_inputfiles", "in_queue_since", "in_queue_until", "_name", @@ -60,7 +61,8 @@ def __init__( self.resources[key] = self.used_resources[key] self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) - self.inputfiles = resources.pop("inputfiles", None) + self.requested_inputfiles = resources.pop("inputfiles", None) + self.used_inputfiles = used_resources.pop("inputfiles", None) self.queue_date = queue_date assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index e1b78c6..0667164 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -3,6 +3,7 @@ import logging from lapis.job import Job +from copy import deepcopy def htcondor_job_reader( @@ -70,7 +71,12 @@ def htcondor_job_reader( ) * unit_conversion_mapping.get(original_key, 1) try: - resources["inputfiles"] = entry["Inputfiles"] + resources["inputfiles"] = deepcopy(entry["Inputfiles"]) + for filename, filespecs in resources["inputfiles"].items(): + del filespecs["usedsize"] + used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) + for filename, filespecs in 
used_resources["inputfiles"].items(): + del filespecs["filesize"] except KeyError: pass yield Job( From 290162b8c6e5f3ac62b7f19ba2d17877aad28984 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 1 Nov 2019 15:34:13 +0100 Subject: [PATCH 278/648] Updated lapis/job_io/htcondor.py --- lapis/job_io/htcondor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 0667164..651bd34 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -72,10 +72,10 @@ def htcondor_job_reader( try: resources["inputfiles"] = deepcopy(entry["Inputfiles"]) - for filename, filespecs in resources["inputfiles"].items(): + for _filename, filespecs in resources["inputfiles"].items(): del filespecs["usedsize"] used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) - for filename, filespecs in used_resources["inputfiles"].items(): + for _filename, filespecs in used_resources["inputfiles"].items(): del filespecs["filesize"] except KeyError: pass From 667c8d740d660b44115b8f825e618eadb8ddd2fa Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Fri, 1 Nov 2019 15:38:29 +0100 Subject: [PATCH 279/648] Update lapis/job_io/htcondor.py Co-Authored-By: Eileen Kuehn --- lapis/job_io/htcondor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 651bd34..68364f3 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -73,7 +73,8 @@ def htcondor_job_reader( try: resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for _filename, filespecs in resources["inputfiles"].items(): - del filespecs["usedsize"] + if "usedsize" in filespecs: + del filespecs["usedsize"] used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for _filename, filespecs in used_resources["inputfiles"].items(): del filespecs["filesize"] From 3fed509f3cbdc632ddce6a4870ddbcb9d06ab3e2 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 1 Nov 2019 15:49:13 +0100 Subject: [PATCH 280/648] fix duplicate for loop --- lapis/job_io/htcondor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 68364f3..e5adcc1 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -72,12 +72,12 @@ def htcondor_job_reader( try: resources["inputfiles"] = deepcopy(entry["Inputfiles"]) - for _filename, filespecs in resources["inputfiles"].items(): - if "usedsize" in filespecs: - del filespecs["usedsize"] used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) - for _filename, filespecs in used_resources["inputfiles"].items(): - del filespecs["filesize"] + for filename, filespecs in entry["Inputfiles"].items(): + if "usedsize" in filespecs: + del resources["inputfiles"][filename]["usedsize"] + if "filesize" in filespecs: + del used_resources["inputfiles"][filename]["filesize"] except KeyError: pass yield Job( From 4e0e4ff8c7d2d413f6f2eca5cf660137486bbf36 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 10:39:54 +0100 Subject: [PATCH 281/648] unittest with strange JSONDecoderError --- lapis_tests/data/job_list_minimal.json | 113 +++++++++++++++++++++++++ lapis_tests/job_io/test_htcondor.py | 19 +++++ 2 files changed, 132 insertions(+) create mode 100644 lapis_tests/data/job_list_minimal.json diff --git a/lapis_tests/data/job_list_minimal.json b/lapis_tests/data/job_list_minimal.json new file mode 100644 index 0000000..726f76a --- 
/dev/null +++ b/lapis_tests/data/job_list_minimal.json @@ -0,0 +1,113 @@ +[ + { + "QDate": 1567169672, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 2000, + "RequestDisk": 6000000, + "RemoteWallClockTime": 100.0, + "'Number of Allocated Processors'": 1, + "MemoryUsage": 2867, + "DiskUsage_RAW": 41898, + "RemoteSysCpu": 10.0, + "RemoteUserCpu": 40.0, + "CPUEfficiency": 0.7, + "Inputfiles": { + "a.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "b.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "c.root": { + "filesize": 25000, + "usedsize": 20000 + } + } + }, + { + "QDate": 1567155456, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 2000, + "RequestDisk": 6000000, + "RemoteWallClockTime": 77.0, + "'Number of Allocated Processors'": 1, + "MemoryUsage": 1207, + "DiskUsage_RAW": 45562, + "RemoteSysCpu": 7.0, + "RemoteUserCpu": 50.0, + "CPUEfficiency": 0.7, + "Inputfiles": { + "a.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "b.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "c.root": { + "filesize": 25000, + "usedsize": 20000 + } + } + }, + { + "QDate": 1567155456, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 2000, + "RequestDisk": 6000000, + "RemoteWallClockTime": 63.0, + "'Number of Allocated Processors'": 1, + "MemoryUsage": 1207, + "DiskUsage_RAW": 45562, + "RemoteSysCpu": 3.0, + "RemoteUserCpu": 40.0, + "Inputfiles": { + "a.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "b.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "c.root": { + "filesize": 25000, + "usedsize": 20000 + } + } + }, + { + "QDate": 1567155456, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 2000, + "RequestDisk": 6000000, + "RemoteWallClockTime": 100.0, + "'Number of Allocated Processors'": 1, + "MemoryUsage": 1207, + "DiskUsage_RAW": 45562, + "RemoteSysCpu": 7.0, + "RemoteUserCpu": 40.3, + "CPUEfficiency": 4.7, + "Inputfiles": { + "a.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "b.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "c.root": { + "filesize": 25000, + "usedsize": 20000 + } + } + } +] diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index 3e110f5..2afe888 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -19,3 +19,22 @@ def test_simple_read(self): # ensure that one job was removed by importer (wrong walltime given) lines = sum(1 for _ in input_file) assert jobs == (lines - 2) + + def test_read_with_inputfiles(self): + with open( + os.path.join( + os.path.dirname(__file__), "..", "data", "job_list_minimal.json" + ) + ) as input_file: + print( + os.path.join( + os.path.dirname(__file__), "..", "data", "job_list_minimal.json" + ) + ) + jobs = 0 + # lines = sum(1 for _ in input_file) + for job in htcondor_job_reader(input_file): + assert job is not None + jobs += 1 + assert jobs > 0 + # assert jobs == (lines - 1) From f08b698987546232086212d28dc2692fd5b05c16 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 11:24:12 +0100 Subject: [PATCH 282/648] updated test_htcondor unit test --- lapis_tests/data/job_list_minimal.json | 2 +- lapis_tests/job_io/test_htcondor.py | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/lapis_tests/data/job_list_minimal.json b/lapis_tests/data/job_list_minimal.json index 726f76a..fc31d03 100644 --- a/lapis_tests/data/job_list_minimal.json +++ b/lapis_tests/data/job_list_minimal.json @@ -88,7 +88,7 @@ 
"RequestWalltime": 60, "RequestMemory": 2000, "RequestDisk": 6000000, - "RemoteWallClockTime": 100.0, + "RemoteWallClockTime": -100.0, "'Number of Allocated Processors'": 1, "MemoryUsage": 1207, "DiskUsage_RAW": 45562, diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index 2afe888..4460271 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -1,5 +1,5 @@ import os - +import json from lapis.job_io.htcondor import htcondor_job_reader @@ -26,15 +26,23 @@ def test_read_with_inputfiles(self): os.path.dirname(__file__), "..", "data", "job_list_minimal.json" ) ) as input_file: - print( - os.path.join( - os.path.dirname(__file__), "..", "data", "job_list_minimal.json" - ) - ) jobs = 0 - # lines = sum(1 for _ in input_file) for job in htcondor_job_reader(input_file): + print(job.walltime) assert job is not None jobs += 1 + if "inputfiles" in job.resources.keys(): + assert "filesize" in job.resources["inputfiles"].keys() + if "inputfiles" in job.used_resources.keys(): + assert "usedsize" in job.used_resources["inputfiles"].keys() + assert jobs > 0 - # assert jobs == (lines - 1) + + with open( + os.path.join( + os.path.dirname(__file__), "..", "data", "job_list_minimal.json" + ) + ) as input_file: + readout = json.load(input_file) + lines = sum(1 for _ in readout) + assert jobs == (lines - 1) From d5ec1239fd3b0c40526984cf6af7015851a4cca3 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 15:00:33 +0100 Subject: [PATCH 283/648] set input file usedsize to filesize if no usedsize is given --- lapis/job_io/htcondor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index e5adcc1..a1763f4 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -77,7 +77,12 @@ def htcondor_job_reader( if "usedsize" in filespecs: del resources["inputfiles"][filename]["usedsize"] if "filesize" in filespecs: + if "usedsize" not in filespecs: + used_resources["inputfiles"][filename]["usedsize"] = filespecs[ + "filesize" + ] del used_resources["inputfiles"][filename]["filesize"] + except KeyError: pass yield Job( From 9b56e5bc93618eafc3c4904ece60ce7f889705cd Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sat, 2 Nov 2019 15:39:03 +0100 Subject: [PATCH 284/648] Update lapis_tests/job_io/test_htcondor.py Co-Authored-By: Eileen Kuehn --- lapis_tests/job_io/test_htcondor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis_tests/job_io/test_htcondor.py b/lapis_tests/job_io/test_htcondor.py index 4460271..d86e6c3 100644 --- a/lapis_tests/job_io/test_htcondor.py +++ b/lapis_tests/job_io/test_htcondor.py @@ -28,7 +28,6 @@ def test_read_with_inputfiles(self): ) as input_file: jobs = 0 for job in htcondor_job_reader(input_file): - print(job.walltime) assert job is not None jobs += 1 if "inputfiles" in job.resources.keys(): From b08bb1a10a83d0bff15bd40b8406e0986842cf21 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 4 Nov 2019 16:20:27 +0100 Subject: [PATCH 285/648] added file for glossary --- docs/source/glossary.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/glossary.rst diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst new file mode 100644 index 0000000..7d1177d --- /dev/null +++ b/docs/source/glossary.rst @@ -0,0 +1,6 @@ +Glossary of Terms +================= + +.. warning:: + + Nothing defined in the glossary yet. 
From a3fdaa2d0341c1b583a88ee04eb0dd373f92678e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 4 Nov 2019 16:22:10 +0100 Subject: [PATCH 286/648] fixed config for documentation --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 62a86a1..ed717a7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,9 +17,9 @@ # sys.path.insert(0, os.path.abspath('.')) import os import sys -import lapis sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +import lapis # noqa: E402 # -- Project information ----------------------------------------------------- From ff2540a73ab3d27d7806612fb5bb1d719eb96257 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 4 Nov 2019 16:22:31 +0100 Subject: [PATCH 287/648] added general description about lapis --- docs/index.rst | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index a2d7a44..d565ccc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,41 @@ LAPIS -- Simulations for Opportunistic Resources :caption: Contents: source/topics/overview - source/api/modules + source/glossary + +The LAPIS simulator enables the simulation of job execution and scheduling with +a focus on opportunistic resources. The scheduling internally builds on concepts +from `HTCondor`_. The opportunistic resources are managed building on the projects +`TARDIS`_ and `COBalD`_. +The simulation builds on importing well-established input formats to generate +the jobs and set up the infrastructure either in an opportunistic or +classical fashion. + +Simple Command Line Interface +----------------------------- + +Although LAPIS is written to provide an extensive framework for setting up +advanced simulation, it also provides a simple command line interface to get you +started quickly. + +You have the options to start in 1) static, 2) dynamic as well as 3) hybrid +mode enabling you to compare the various simulation outputs. + +.. code-block:: bash + + python cli/simulate.py --log-file - static --job-file swf \ + --pool-file htcondor + +As you can see from the example, you can even mix and match different input +formats to create your required simulation environment. An extensive documentation +about the CLI can be found in the :doc:`source/topics/cli` chapter. + +Simple Framework for Advanced Use Cases +--------------------------------------- + +The implementation of the simulation itself builds on the lightweight simulation +framework `μSim`_. Due to the human-centric API of μSim, it is a charm to actually +read and extend the simulation for adaptation to various use cases. Indices and tables ================== @@ -19,3 +53,8 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` + +.. _HTCondor: https://research.cs.wisc.edu/htcondor/ +.. _COBalD: https://cobald.readthedocs.io/en/latest/ +.. _TARDIS: https://cobald-tardis.readthedocs.io/en/latest +.. 
_μSim: https://usim.readthedocs.io/en/latest/ From 7a00e6e762f7a89b98e1c155b6ba9f66fdb69f2d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 4 Nov 2019 17:09:51 +0100 Subject: [PATCH 288/648] introduced the first terms in the glossary --- docs/index.rst | 7 +++-- docs/source/glossary.rst | 47 +++++++++++++++++++++++++++++-- docs/source/topics/monitoring.rst | 4 +-- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index d565ccc..1a92f96 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,9 +14,10 @@ LAPIS -- Simulations for Opportunistic Resources source/glossary The LAPIS simulator enables the simulation of job execution and scheduling with -a focus on opportunistic resources. The scheduling internally builds on concepts -from `HTCondor`_. The opportunistic resources are managed building on the projects -`TARDIS`_ and `COBalD`_. +a focus on :term:`opportunistic resources `. The +scheduling internally builds on concepts from `HTCondor`_. The +:term:`opportunistic resources ` are managed building on +the projects `TARDIS`_ and `COBalD`_. The simulation builds on importing well-established input formats to generate the jobs and set up the infrastructure either in an opportunistic or classical fashion. diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index 7d1177d..1e6ac30 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -1,6 +1,49 @@ Glossary of Terms ================= -.. warning:: +.. Using references in the glossary itself: + When mentioning other items, always reference them. + When mentioning the current item, never reference it. - Nothing defined in the glossary yet. +.. glossary:: + + Allocation + Information about the amount of resources being acquired by a :term:`drone` + without evaluating the effectiveness of use. + + .. note:: + + The general concept of allocation and :term:`utilisation` is introduced + by COBalD to internally decide about which resources to integrate or + disintegrate. + + Drone + Partitionable placeholder for jobs. In the current state of LAPIS a drone + represents a single worker node provided by a specific :term:`pool`. + + .. note:: + + The concept of drones is introduced by TARDIS. Drones integrate + themselves into an HTCondor overlay batch system and thereby provision + the resources for jobs. They act nearly autonomously to e.g. manage + shutdown and error handling if required. + + Opportunistic Resource + Any resources not permanently dedicated to but temporarily available for + a specific task, user, or group. + + Pool + A collection of indistinguishable resources. This means that a pool + defines the number of worker nodes having a specific combination of + available resources e.g. number of cores, memory, or disk. A resource + provider can provide a number of pools. + + Utilisation + Information about the effectiveness of use of resources acquired by a + :term:`drone`. + + .. note:: + + The general concept of :term:`allocation` and utilisation is introduced + by COBalD to internally decide about which resources to integrate or + disintegrate. diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index 3673823..dfc30fa 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -2,8 +2,8 @@ Monitoring Simulation Data ========================== Lapis provides some predefined functions that provide monitoring of relevant -information about your pools, resources, and jobs. 
Further, information -relevant to COBalD are provided. +information about your :term:`pools `, resources, and jobs. Further, +information relevant to COBalD are provided. In the following you find tables summarising the available information. From e0676009611a3332420c3f40f020c3cc6391eb81 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 5 Nov 2019 15:03:35 +0100 Subject: [PATCH 289/648] added dependency to contentui sphinx package --- docs/conf.py | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index ed717a7..8ab5fa0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,6 +49,7 @@ "sphinx.ext.todo", "sphinx.ext.imgmath", "sphinx.ext.viewcode", + "sphinxcontrib.contentui", ] # Add any paths that contain templates here, relative to this directory. diff --git a/pyproject.toml b/pyproject.toml index 9b22d0f..56fbbee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ test = [ "flake8-bugbear", "black; implementation_name=='cpython'", ] -doc = ["sphinx", "sphinx_rtd_theme"] +doc = ["sphinx", "sphinx_rtd_theme", "sphinxcontrib-contentui"] dev = ["pre-commit"] [tool.flit.metadata.urls] From a01feb94c784b0cc34803f6d6a791c8922f63e76 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 5 Nov 2019 15:03:59 +0100 Subject: [PATCH 290/648] introduced tabs for displaying the different execution modes of lapis --- docs/index.rst | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 1a92f96..32e93a8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -32,10 +32,46 @@ started quickly. You have the options to start in 1) static, 2) dynamic as well as 3) hybrid mode enabling you to compare the various simulation outputs. -.. code-block:: bash +.. content-tabs:: - python cli/simulate.py --log-file - static --job-file swf \ - --pool-file htcondor + .. tab-container:: static + :title: Static + + The *static* environment provides a classical setup where all resources + are available exclusively for processing the jobs for the whole runtime + of the simulation. + + .. code-block:: bash + + python cli/simulate.py --log-file - static --job-file swf \ + --pool-file htcondor + + .. tab-container:: dynamic + :title: Dynamic + + The *dynamic* environment builds on volatile, opportunistic resources + exclusively. Based on the amount of jobs being processed within the + simulation COBalD controllers decide about the integration and + disintegration of resources. + + .. code-block:: bash + + python cli/simulate.py --log-file - dynamic --job-file swf \ + --pool-file htcondor + + .. tab-container:: hybrid + :title: Hybrid + + The *hybrid* simulation environment provides a baseline of static resources + that are available for the whole runtime of the simulation. These static + resources are dynamically complemented with volatile, opportunistic + resources based on current job pressure. + + .. code-block:: bash + + python cli/simulate.py --log-file - hybrid --job-file swf \ + --static-pool-file htcondor \ + --dynamic-pool-file htcondor As you can see from the example, you can even mix and match different input formats to create your required simulation environment. 
An extensive documentation From ced5eea52eac22b5c0c0942c353dfd742bfb7c86 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 5 Nov 2019 15:58:11 +0100 Subject: [PATCH 291/648] fixed naming of doc requirement --- .readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 076647a..da28101 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -6,7 +6,7 @@ python: - method: pip path: . extra_requirements: - - docs + - doc sphinx: builder: html From 4edea2a05def95cd49204c85873b8bbf3f5f8da3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 6 Nov 2019 21:29:22 +0100 Subject: [PATCH 292/648] added page for overall lapis concept --- docs/source/topics/concept.rst | 2 ++ docs/source/topics/overview.rst | 1 + 2 files changed, 3 insertions(+) create mode 100644 docs/source/topics/concept.rst diff --git a/docs/source/topics/concept.rst b/docs/source/topics/concept.rst new file mode 100644 index 0000000..e555972 --- /dev/null +++ b/docs/source/topics/concept.rst @@ -0,0 +1,2 @@ +Simulation Concept +================== diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index 75232dc..f00d705 100644 --- a/docs/source/topics/overview.rst +++ b/docs/source/topics/overview.rst @@ -6,6 +6,7 @@ This is a collection of separate topics on LAPIS. .. toctree:: :maxdepth: 1 + concept cli support monitoring From b9c0237f10cce3eab17f50f02b8681220b4cc0f6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 6 Nov 2019 21:30:26 +0100 Subject: [PATCH 293/648] added documentation contents from tabea, closes #54 --- docs/source/topics/support.rst | 90 ++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/docs/source/topics/support.rst b/docs/source/topics/support.rst index da75f3c..52f08b7 100644 --- a/docs/source/topics/support.rst +++ b/docs/source/topics/support.rst @@ -4,8 +4,98 @@ Supported File Formats TARDIS ------ +.. warning:: + + Import of TARDIS configuration files not supported yet, but will be + available in the future. + HTCondor -------- +Job Imports +~~~~~~~~~~~ + +Jobs can be created directly from HTCondor outputs. Via the ``condor_history`` +command from HTCondor, ClassAds describing a jobs requested and used resources +can be gathered and saved to a csv or json file. +To sufficiently describe a job for the simulation information about requested +and used resources should be included in the export: + +requested resources: + RequestCpus, RequestWalltime, RequestMemory, RequestDisk + +used resources: + RemoteWallClockTime, MemoryUsage, DiskUsage_RAW, RemoteSysCpu, RemoteUserCpu + +additional job information: + QDate + +If csv is chosen as input file format every line represents a job, columns +should be separated by spaces, comments should be marked by simple quotation +marks. + +If information about a jobs input files are passed to lapis a json file should +contain job descriptions because this file format allows for nested structures. +In this case the json file should contain an array of objects, each representing +a job. + +ClassAds containing information about a jobs input files are not part of a jobs +standard ClassAds in HTCondor but can be extracted via external tools (e.g. job +submission tools) and stored as Inputfiles ClassAd. +Alternatively this information can be added to the job input file manually. + +The ``Inputfile`` ClassAd contains dictionary with the input file names serving +as keys and subdictionaries further describing the input files. 
+These subdictionaries provide + +filesize: + the files total size in MB + +usedsize: + the amount of data the job actually reads from this file in MB + +.. code-block:: csv + + TODO + +.. code-block:: json + + [ + { + "QDate": 1567169672, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 2000, + "RequestDisk": 6000000, + "RemoteWallClockTime": 100.0, + "MemoryUsage": 2867, + "DiskUsage_RAW": 41898, + "RemoteSysCpu": 10.0, + "RemoteUserCpu": 40.0, + }, + { + "QDate": 1567169672, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 2000, + "RequestDisk": 6000000, + "RemoteWallClockTime": 100.0, + "MemoryUsage": 2867, + "DiskUsage_RAW": 41898, + "RemoteSysCpu": 10.0, + "RemoteUserCpu": 40.0, + "Inputfiles": { + "a.root": { + "filesize": 25000, + "usedsize": 20000 + }, + "b.root": { + "filesize": 25000, + "usedsize": 20000 + } + } + } + ] + SWF Format ---------- From ebec83e90d4b85141a9db6664251e2cbc76f4278 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 6 Nov 2019 21:53:14 +0100 Subject: [PATCH 294/648] removed field from htcondor import that is not standardised --- lapis/job_io/htcondor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index a1763f4..6e878c4 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -17,7 +17,6 @@ def htcondor_job_reader( used_resource_name_mapping={ # noqa: B006 "queuetime": "QDate", "walltime": "RemoteWallClockTime", # s - "cores": "Number of Allocated Processors", "memory": "MemoryUsage", # MB "disk": "DiskUsage_RAW", # KiB }, @@ -28,7 +27,6 @@ def htcondor_job_reader( "RequestDisk": 1.024 / 1024 / 1024, "queuetime": 1, "RemoteWallClockTime": 1, - "Number of Allocated Processors": 1, "MemoryUsage": 1 / 1024, "DiskUsage_RAW": 1.024 / 1024 / 1024, }, From d056f904def605881c10b16345146075338f1534 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 6 Nov 2019 21:54:29 +0100 Subject: [PATCH 295/648] corrected units to bytes and introduced them as int type --- lapis/job_io/htcondor.py | 24 +++++++++++++----------- lapis/job_io/swf.py | 34 ++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 6e878c4..f891db7 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -23,12 +23,12 @@ def htcondor_job_reader( unit_conversion_mapping={ # noqa: B006 "RequestCpus": 1, "RequestWalltime": 1, - "RequestMemory": 1.024 / 1024, - "RequestDisk": 1.024 / 1024 / 1024, + "RequestMemory": 1024 * 1024, + "RequestDisk": 1024, "queuetime": 1, "RemoteWallClockTime": 1, - "MemoryUsage": 1 / 1024, - "DiskUsage_RAW": 1.024 / 1024 / 1024, + "MemoryUsage": 1 / 1.048576 * 1024 * 1024, + "DiskUsage_RAW": 1024, }, ): input_file_type = iterable.name.split(".")[-1].lower() @@ -49,9 +49,10 @@ def htcondor_job_reader( resources = {} for key, original_key in resource_name_mapping.items(): try: - resources[key] = float( - entry[original_key] - ) * unit_conversion_mapping.get(original_key, 1) + resources[key] = int( + float(entry[original_key]) + * unit_conversion_mapping.get(original_key, 1) + ) except ValueError: pass @@ -60,13 +61,14 @@ def htcondor_job_reader( (float(entry["RemoteSysCpu"]) + float(entry["RemoteUserCpu"])) / float(entry[used_resource_name_mapping["walltime"]]) ) - * unit_conversion_mapping.get(used_resource_name_mapping["cores"], 1) + * unit_conversion_mapping.get(resource_name_mapping["cores"], 1) } for key in ["memory", "walltime", "disk"]: original_key = 
used_resource_name_mapping[key] - used_resources[key] = float( - entry[original_key] - ) * unit_conversion_mapping.get(original_key, 1) + used_resources[key] = int( + float(entry[original_key]) + * unit_conversion_mapping.get(original_key, 1) + ) try: resources["inputfiles"] = deepcopy(entry["Inputfiles"]) diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index baf45df..99124bd 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -12,18 +12,18 @@ def swf_job_reader( iterable, resource_name_mapping={ # noqa: B006 "cores": "Requested Number of Processors", - "walltime": "Requested Time", - "memory": "Requested Memory", + "walltime": "Requested Time", # s + "memory": "Requested Memory", # KiB }, used_resource_name_mapping={ # noqa: B006 - "walltime": "Run Time", + "walltime": "Run Time", # s "cores": "Number of Allocated Processors", - "memory": "Used Memory", + "memory": "Used Memory", # KiB "queuetime": "Submit Time", }, unit_conversion_mapping={ # noqa: B006 - "Used Memory": 1 / 1024 / 1024, - "Requested Memory": 1 / 2114 / 1024, + "Used Memory": 1024, + "Requested Memory": 1024, }, ): header = { @@ -71,14 +71,20 @@ def swf_job_reader( ) # handle memory key = "memory" - resources[key] = ( - float(row[header[resource_name_mapping[key]]]) - * float(row[header[resource_name_mapping["cores"]]]) - ) * unit_conversion_mapping.get(resource_name_mapping[key], 1) - used_resources[key] = ( - float(row[header[used_resource_name_mapping[key]]]) - * float(row[header[used_resource_name_mapping["cores"]]]) - ) * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + resources[key] = int( + ( + float(row[header[resource_name_mapping[key]]]) + * float(row[header[resource_name_mapping["cores"]]]) + ) + * unit_conversion_mapping.get(resource_name_mapping[key], 1) + ) + used_resources[key] = int( + ( + float(row[header[used_resource_name_mapping[key]]]) + * float(row[header[used_resource_name_mapping["cores"]]]) + ) + * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) + ) yield Job( resources=resources, used_resources=used_resources, From 6f71d7f6fe94f99df63f013aa22f1b9ab0d0b261 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 06:41:30 +0100 Subject: [PATCH 296/648] Update lapis/job_io/htcondor.py Co-Authored-By: Max Fischer --- lapis/job_io/htcondor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index f891db7..60a3684 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -27,7 +27,7 @@ def htcondor_job_reader( "RequestDisk": 1024, "queuetime": 1, "RemoteWallClockTime": 1, - "MemoryUsage": 1 / 1.048576 * 1024 * 1024, + "MemoryUsage": 1000 * 1000, "DiskUsage_RAW": 1024, }, ): From bc556a72f8130960c6707cd451b837790a786ea3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 06:50:29 +0100 Subject: [PATCH 297/648] added unit conversion also for pool imports --- lapis/pool_io/htcondor.py | 6 +++--- lapis/pool_io/machines.py | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 314eb1e..0dba5c1 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -14,8 +14,8 @@ def htcondor_pool_reader( }, unit_conversion_mapping: dict = { # noqa: B006 "TotalSlotCPUs": 1, - "TotalSlotDisk": 1.024 / 1024, - "TotalSlotMemory": 1.024 / 1024, + "TotalSlotDisk": 1024 * 1024, + "TotalSlotMemory": 1024 * 1024, }, pool_type: Callable = Pool, make_drone: Callable = None, @@ -44,7 
+44,7 @@ def htcondor_pool_reader( make_drone=partial( make_drone, { - key: float(row[value]) * unit_conversion_mapping.get(value, 1) + key: int(float(row[value]) * unit_conversion_mapping.get(value, 1)) for key, value in resource_name_mapping.items() }, ignore_resources=["disk"], diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py index 30a2488..38e0e94 100644 --- a/lapis/pool_io/machines.py +++ b/lapis/pool_io/machines.py @@ -11,6 +11,10 @@ def machines_pool_reader( "cores": "CPUs_per_node", "memory": "RAM_per_node_in_KB", }, + unit_conversion_mapping={ # noqa: B006 + "CPUs_per_node": 1, + "RAM_per_node_in_KB": 1000, + }, pool_type: Callable = Pool, make_drone: Callable = None, ): @@ -33,7 +37,7 @@ def machines_pool_reader( make_drone=partial( make_drone, { - key: float(row[value]) + key: int(float(row[value]) * unit_conversion_mapping.get(value, 1)) for key, value in resource_name_mapping.items() }, ), From ecdfc51b63fd9a491aead99aa4350e210df01221 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 21:13:50 +0100 Subject: [PATCH 298/648] adapted type hints for monitoring functions --- lapis/monitor/cobald.py | 5 +++-- lapis/monitor/general.py | 14 +++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py index c609d30..21dfee0 100644 --- a/lapis/monitor/cobald.py +++ b/lapis/monitor/cobald.py @@ -2,13 +2,14 @@ from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter +from typing import List, Dict from lapis.drone import Drone from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler from lapis.pool import Pool -def drone_statistics(drone: Drone) -> list: +def drone_statistics(drone: Drone) -> List[Dict]: """ Collect allocation, utilisation, demand and supply of drones. @@ -41,7 +42,7 @@ def drone_statistics(drone: Drone) -> list: } -def pool_statistics(pool: Pool) -> list: +def pool_statistics(pool: Pool) -> List[Dict]: """ Collect allocation, utilisation, demand and supply of pools. diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index cb9136b..be6d24d 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List, Dict import logging.handlers @@ -15,7 +15,7 @@ from lapis.simulator import Simulator -def resource_statistics(drone: Drone) -> list: +def resource_statistics(drone: Drone) -> List[Dict]: """ Log ratio of used and requested resources for drones. @@ -53,7 +53,7 @@ def resource_statistics(drone: Drone) -> list: } -def user_demand(job_queue: JobQueue) -> list: +def user_demand(job_queue: JobQueue) -> List[Dict]: """ Log global user demand. @@ -75,7 +75,7 @@ def user_demand(job_queue: JobQueue) -> list: } -def job_statistics(scheduler: CondorJobScheduler) -> list: +def job_statistics(scheduler: CondorJobScheduler) -> List[Dict]: """ Log number of jobs running in all drones. @@ -113,7 +113,7 @@ def job_statistics(scheduler: CondorJobScheduler) -> list: } -def job_events(job: Job) -> list: +def job_events(job: Job) -> List[Dict]: """ Log relevant events for jobs. Relevant events are @@ -175,7 +175,7 @@ def job_events(job: Job) -> list: } -def pool_status(pool: Pool) -> list: +def pool_status(pool: Pool) -> List[Dict]: """ Log state changes of pools and drones. 
@@ -197,7 +197,7 @@ def pool_status(pool: Pool) -> list: } -def configuration_information(simulator: "Simulator") -> list: +def configuration_information(simulator: "Simulator") -> List[Dict]: """ Log information how pools and drones are configured, e.g. provided resources. From c3d57dac5d54c5cb31a4528e7cb6d5999660ea63 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 22:20:37 +0100 Subject: [PATCH 299/648] added first information for logging --- docs/source/topics/monitoring.rst | 63 ++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index dfc30fa..9e617da 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -1,19 +1,70 @@ Monitoring Simulation Data ========================== +Monitoring information is critical information in simulations. However, the +monitoring overhead can be significant. For this reason, LAPIS provides an object-based +monitoring. Whenever a monitoring-relevant object does change during simulation +the object is put into a monitoring :py:class:`usim.Queue` for further processing. + +When running a simulation you should register your required logging callable +with the monitoring component. There is already a number of predefined logging +callables that can easily be used, see :ref:`predefined_monitoring_functions`. +Each of these logging functions is parameterised with the objects it is able to +process. Whenever an object becomes available in the monitoring queue, it is +checked if matching logging callables have been registered to handle the specific +object. The monitoring itself runs asynchronously: Whenever elements become +available in the monitoring queue, the logging process starts. + +If you want to define your own logging callable that for example logs information +about changes to a drone it should follow the following format: + +.. code-block:: python3 + + def log_object(the_object: Drone) -> List[Dict]: + return [] + log_object.name: str = "identifying_name" + log_object.whitelist: Tuple = (Drone,) + log_object.logging_formatter: Dict = { + LoggingSocketHandler.__name__: JsonFormatter(), + } + +Information about the object types being processed by your callable is given as a +:py:class:`tuple` in :py:attr:`whitelist`. You further need to set an identifying +:py:attr:`name` for your callable as well as :py:class:`logging.Formatter` for +specific logging options. + +LAPIS currently supports logging to + +* TCP, +* File, and/or +* Telegraf. + +See :doc:`cli` for details on how to utilise the different logging options. + +.. _predefined_monitoring_functions: + +Predefined Monitoring Functions +------------------------------- + Lapis provides some predefined functions that provide monitoring of relevant information about your :term:`pools `, resources, and jobs. Further, information relevant to COBalD are provided. -In the following you find tables summarising the available information. +General Monitoring +~~~~~~~~~~~~~~~~~~ -The CLI of LAPIS currently supports logging to +.. autofunction:: lapis.monitor.general.resource_statistics +.. autofunction:: lapis.monitor.general.user_demand +.. autofunction:: lapis.monitor.general.job_statistics +.. autofunction:: lapis.monitor.general.job_events +.. autofunction:: lapis.monitor.general.pool_status +.. autofunction:: lapis.monitor.general.configuration_information -* TCP, -* File, or -* Telegraf. 
+COBalD-specific Monitoring +~~~~~~~~~~~~~~~~~~~~~~~~~~ -See :doc:`cli` for details. +.. autofunction:: lapis.monitor.cobald.drone_statistics +.. autofunction:: lapis.monitor.cobald.pool_statistics Telegraf -------- From 7bb84c896a123449fa61f419be615ba4c2a3b2fb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 22:29:02 +0100 Subject: [PATCH 300/648] added information on how to register logging function --- docs/source/topics/monitoring.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index 9e617da..a779774 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -33,6 +33,14 @@ Information about the object types being processed by your callable is given as :py:attr:`name` for your callable as well as :py:class:`logging.Formatter` for specific logging options. +Registering your logging callable is very easy then, you just need to call + +.. code-block:: python3 + + simulator.monitoring.register_statistic(log_object) + +That's it! + LAPIS currently supports logging to * TCP, From 2d979a97bdb80222d462afb96ac3eba55d805172 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 22:41:22 +0100 Subject: [PATCH 301/648] added dependency to sphinx-click --- docs/conf.py | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 8ab5fa0..b43e6de 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,6 +50,7 @@ "sphinx.ext.imgmath", "sphinx.ext.viewcode", "sphinxcontrib.contentui", + "sphinx_click.ext", ] # Add any paths that contain templates here, relative to this directory. diff --git a/pyproject.toml b/pyproject.toml index 56fbbee..5dc1fbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ test = [ "flake8-bugbear", "black; implementation_name=='cpython'", ] -doc = ["sphinx", "sphinx_rtd_theme", "sphinxcontrib-contentui"] +doc = ["sphinx", "sphinx_rtd_theme", "sphinxcontrib-contentui", "sphinx-click"] dev = ["pre-commit"] [tool.flit.metadata.urls] From 2ea790c39a0bf35568cfd469bcb7683807996be9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 7 Nov 2019 22:41:50 +0100 Subject: [PATCH 302/648] added documentation via sphinx-click, closes #52 --- docs/source/topics/cli.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/topics/cli.rst b/docs/source/topics/cli.rst index a5281dc..a864aad 100644 --- a/docs/source/topics/cli.rst +++ b/docs/source/topics/cli.rst @@ -1,2 +1,6 @@ Command Line Interface ====================== + +.. 
click:: lapis.cli.simulate:cli + :prog: simulate + :show-nested: From d38b709ad3df672f52e9efb8890e0e75dc15e68a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 12 Nov 2019 17:51:57 +0100 Subject: [PATCH 303/648] introduced job queue in drones to make drone independent from scheduler, fixes #60 --- lapis/drone.py | 11 +++++++++-- lapis/pool.py | 11 +++++++---- lapis/scheduler.py | 2 +- lapis_tests/test_job.py | 18 +++++++++++------- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index b6d9cab..2226fbe 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,5 +1,5 @@ from cobald import interfaces -from usim import time, Scope, instant, Capacities, ResourcesUnavailable +from usim import time, Scope, instant, Capacities, ResourcesUnavailable, Queue from lapis.job import Job @@ -44,6 +44,7 @@ def __init__( self.jobs = 0 self._allocation = None self._utilisation = None + self._job_queue = Queue() @property def theoretical_available_resources(self): @@ -60,6 +61,9 @@ async def run(self): self._supply = 1 self.scheduler.register_drone(self) await sampling_required.put(self) + async with Scope() as scope: + async for job, kill in self._job_queue: + scope.do(self._run_job(job=job, kill=kill)) @property def supply(self) -> float: @@ -103,7 +107,10 @@ async def shutdown(self): await sampling_required.put(self) # TODO: introduce state of drone await (time + 1) - async def start_job(self, job: Job, kill: bool = False): + async def schedule_job(self, job: Job, kill: bool = False): + await self._job_queue.put((job, kill)) + + async def _run_job(self, job: Job, kill: bool): """ Method manages to start a job in the context of the given drone. The job is started independent of available resources. If resources of diff --git a/lapis/pool.py b/lapis/pool.py index 928b710..2a3e465 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -29,20 +29,21 @@ def __init__( assert init <= capacity self.make_drone = make_drone self._drones = [] - self.init_pool(init=init) self._demand = 1 self._level = init self._capacity = capacity self._name = name - def init_pool(self, init: int = 0): + async def init_pool(self, scope: Scope, init: int = 0): """ Initialisation of existing drones at creation time of pool. :param init: Number of drones to create. """ for _ in range(init): - self._drones.append(self.make_drone(0)) + drone = self.make_drone(0) + scope.do(drone.run()) + self._drones.append(drone) # TODO: the run method currently needs to be called manually async def run(self): @@ -52,6 +53,7 @@ async def run(self): initialising new drones. Otherwise drones get removed. """ async with Scope() as scope: + await self.init_pool(scope=scope, init=self._level) async for _ in interval(1): drones_required = min(self._demand, self._capacity) - self._level while drones_required > 0: @@ -145,5 +147,6 @@ async def run(self): """ Pool runs forever and does not check if number of drones needs to be adapted. 
""" - while True: + async with Scope() as scope: + await self.init_pool(scope=scope, init=self._level) await eternity diff --git a/lapis/scheduler.py b/lapis/scheduler.py index c3ad63e..ecedc14 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -90,7 +90,7 @@ async def run(self): for job in self.job_queue: best_match = self._schedule_job(job) if best_match: - scope.do(best_match.start_job(job)) + await best_match.schedule_job(job) self.job_queue.remove(job) await sampling_required.put(self.job_queue) self.unregister_drone(best_match) diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 13b4090..3c75916 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -47,9 +47,10 @@ async def test_job_in_drone(self): pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, ) + await drone.run() async with Scope() as scope: - scope.do(drone.start_job(job=job)) - assert 10 == time + scope.do(drone.schedule_job(job=job)) + assert 10 == time.now assert 0 == job.waiting_time assert job.successful @@ -65,8 +66,9 @@ async def test_nonmatching_job_in_drone(self): pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, ) + await drone.run() async with Scope() as scope: - scope.do(drone.start_job(job=job)) + scope.do(drone.schedule_job(job=job)) assert 0 == time assert not job.successful assert 0 == job.waiting_time @@ -87,9 +89,10 @@ async def test_two_nonmatching_jobs(self): pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, ) + await drone.run() async with Scope() as scope: - scope.do(drone.start_job(job=job_one)) - scope.do(drone.start_job(job=job_two)) + scope.do(drone.schedule_job(job=job_one)) + scope.do(drone.schedule_job(job=job_two)) assert 10 == time assert job_one.successful assert not job_two.successful @@ -112,9 +115,10 @@ async def test_two_matching_jobs(self): pool_resources={"cores": 2, "memory": 2}, scheduling_duration=0, ) + await drone.run() async with Scope() as scope: - scope.do(drone.start_job(job=job_one)) - scope.do(drone.start_job(job=job_two)) + scope.do(drone.schedule_job(job=job_one)) + scope.do(drone.schedule_job(job=job_two)) assert 10 == time assert job_one.successful assert job_two.successful From a16b2aa393eec9de43ed2ef02c57ce0b1c5819dc Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 12 Nov 2019 17:52:40 +0100 Subject: [PATCH 304/648] enabled resending unsuccessful jobs to scheduler --- lapis/drone.py | 3 +++ lapis/scheduler.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index 2226fbe..9826c78 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -154,6 +154,9 @@ async def _run_job(self, job: Job, kill: bool): await instant job_execution.cancel() self.jobs -= 1 + if not job.successful: + job.drone = None + await self.scheduler.retry_job(job) self._utilisation = self._allocation = None self.scheduler.update_drone(self) await sampling_required.put(self) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index ecedc14..7e9a42d 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -111,6 +111,9 @@ async def _collect_jobs(self): await sampling_required.put(self.job_queue) self._collecting = False + async def retry_job(self, job): + await self._stream_queue.put(job) + def _schedule_job(self, job) -> Drone: priorities = {} for cluster in self.drone_cluster: From b3b719cf65d18120911fa4dac999dc46fa428117 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 13 Nov 2019 19:12:41 +0100 Subject: [PATCH 305/648] added test to check that jobs are finished when 
simulation ends --- lapis_tests/test_simulator.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 lapis_tests/test_simulator.py diff --git a/lapis_tests/test_simulator.py b/lapis_tests/test_simulator.py new file mode 100644 index 0000000..4875d6b --- /dev/null +++ b/lapis_tests/test_simulator.py @@ -0,0 +1,40 @@ +from tempfile import NamedTemporaryFile + +from lapis.job_io.htcondor import htcondor_job_reader +from lapis.pool import StaticPool +from lapis.pool_io.htcondor import htcondor_pool_reader +from lapis.scheduler import CondorJobScheduler +from lapis.simulator import Simulator + + +class TestSimulator(object): + def test_simulation_exit(self): + simulator = Simulator() + with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( + suffix=".csv" + ) as job_config: + with open(machine_config.name, "w") as write_stream: + write_stream.write( + "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count\n" + "1 44624348.0 8000 1" + ) + with open(job_config.name, "w") as write_stream: + write_stream.write( + "QDate RequestCpus RequestWalltime RequestMemory RequestDisk " + "RemoteWallClockTime MemoryUsage DiskUsage_RAW RemoteSysCpu " + "RemoteUserCpu\n" + "1567155456 1 60 2000 6000000 100.0 2867 41898 10.0 40.0" + ) + job_input = open(job_config.name, "r+") + machine_input = open(machine_config.name, "r+") + simulator.create_job_generator( + job_input=job_input, job_reader=htcondor_job_reader + ) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.create_pools( + pool_input=machine_input, + pool_reader=htcondor_pool_reader, + pool_type=StaticPool, + ) + simulator.run() + assert 180 == simulator.duration From 5eccefd557632e7b354dcd962c7e5c5008eed4c9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 13 Nov 2019 19:14:42 +0100 Subject: [PATCH 306/648] added counter for running jobs to ensure that everything is finished, fixes #64 --- lapis/drone.py | 5 ++--- lapis/scheduler.py | 17 +++++++++++++---- lapis/simulator.py | 4 +++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 9826c78..7126242 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -154,9 +154,8 @@ async def _run_job(self, job: Job, kill: bool): await instant job_execution.cancel() self.jobs -= 1 - if not job.successful: - job.drone = None - await self.scheduler.retry_job(job) + job.drone = None + await self.scheduler.job_finished(job) self._utilisation = self._allocation = None self.scheduler.update_drone(self) await sampling_required.put(self) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 7e9a42d..dc564b9 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,5 +1,5 @@ from typing import Dict -from usim import Scope, interval +from usim import Scope, interval, Resources from lapis.drone import Drone from lapis.monitor import sampling_required @@ -32,6 +32,7 @@ def __init__(self, job_queue): self.interval = 60 self.job_queue = JobQueue() self._collecting = True + self._processing = Resources(jobs=0) @property def drone_list(self): @@ -100,19 +101,27 @@ async def run(self): for key, value in left_resources.items() } self._add_drone(best_match, left_resources) - if not self._collecting and not self.job_queue: + if ( + not self._collecting + and not self.job_queue + and self._processing.levels.jobs == 0 + ): break await sampling_required.put(self) async def _collect_jobs(self): async for job in self._stream_queue: self.job_queue.append(job) + await 
self._processing.increase(jobs=1) # TODO: logging happens with each job await sampling_required.put(self.job_queue) self._collecting = False - async def retry_job(self, job): - await self._stream_queue.put(job) + async def job_finished(self, job): + if job.successful: + await self._processing.decrease(jobs=1) + else: + await self._stream_queue.put(job) def _schedule_job(self, job) -> Drone: priorities = {} diff --git a/lapis/simulator.py b/lapis/simulator.py index 9689ae2..2920202 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -32,6 +32,7 @@ def __init__(self, seed=1234): self.cost = 0 self._job_generators = [] self.monitoring = None + self.duration = None self.enable_monitoring() def enable_monitoring(self): @@ -77,7 +78,8 @@ async def _simulate(self, end): for controller in self.controllers: while_running.do(controller.run(), volatile=True) while_running.do(self.monitoring.run(), volatile=True) - print(f"Finished simulation at {time.now}") + self.duration = time.now + print(f"Finished simulation at {self.duration}") async def _queue_jobs(self, job_input, job_reader): await job_to_queue_scheduler( From 94d0361974d765beb2399282b9d6b18918416d55 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 21 Nov 2019 14:23:27 +0100 Subject: [PATCH 307/648] added drone parameter to run a job --- lapis/drone.py | 3 +-- lapis/job.py | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 7126242..0e6845a 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -127,7 +127,7 @@ async def _run_job(self, job: Job, kill: bool): self._utilisation = self._allocation = None - job_execution = scope.do(job.run()) + job_execution = scope.do(job.run(self)) self.jobs += 1 try: async with self.resources.claim( @@ -154,7 +154,6 @@ async def _run_job(self, job: Job, kill: bool): await instant job_execution.cancel() self.jobs -= 1 - job.drone = None await self.scheduler.job_finished(job) self._utilisation = self._allocation = None self.scheduler.update_drone(self) diff --git a/lapis/job.py b/lapis/job.py index 934ceee..6d21348 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -91,18 +91,22 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - async def run(self): + async def run(self, drone: "Drone"): + self.drone = drone self.in_queue_until = time.now self._success = None await sampling_required.put(self) try: await (time + self.walltime) except CancelTask: + self.drone = None self._success = False except BaseException: + self.drone = None self._success = False raise else: + self.drone = None self._success = True await sampling_required.put(self) From efa8813b181bf787db996fb4460274ff953502bc Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 21 Nov 2019 14:23:59 +0100 Subject: [PATCH 308/648] drones now only run with a specified drone, closes #55 --- lapis_tests/__init__.py | 4 ++++ lapis_tests/test_job.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index d0c54e8..722b3a2 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -43,3 +43,7 @@ def unregister_drone(drone: Drone): @staticmethod def update_drone(drone: Drone): pass + + +class DummyDrone: + pass diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 3c75916..181bb1a 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -3,7 +3,7 @@ from lapis.drone import Drone from lapis.job import Job -from lapis_tests import 
via_usim, DummyScheduler +from lapis_tests import via_usim, DummyScheduler, DummyDrone class TestJob(object): @@ -27,10 +27,11 @@ def test_name(self): @via_usim async def test_run_job(self): + drone = DummyDrone() job = Job(resources={"walltime": 50}, used_resources={"walltime": 10}) assert float("inf") == job.waiting_time async with Scope() as scope: - scope.do(job.run()) + scope.do(job.run(drone)) assert 10 == time assert 0 == job.waiting_time assert job.successful From 10102fb379fe851b1529e594c31975089f73948c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 21 Nov 2019 14:23:59 +0100 Subject: [PATCH 309/648] drones now only run with a specified drone, closes #55 --- lapis/job.py | 1 + lapis_tests/__init__.py | 4 ++++ lapis_tests/test_job.py | 5 +++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 6d21348..c4627e0 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -92,6 +92,7 @@ def waiting_time(self) -> float: return float("Inf") async def run(self, drone: "Drone"): + assert drone, "Jobs cannot run without a drone being assigned" self.drone = drone self.in_queue_until = time.now self._success = None diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index d0c54e8..722b3a2 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -43,3 +43,7 @@ def unregister_drone(drone: Drone): @staticmethod def update_drone(drone: Drone): pass + + +class DummyDrone: + pass diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 3c75916..181bb1a 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -3,7 +3,7 @@ from lapis.drone import Drone from lapis.job import Job -from lapis_tests import via_usim, DummyScheduler +from lapis_tests import via_usim, DummyScheduler, DummyDrone class TestJob(object): @@ -27,10 +27,11 @@ def test_name(self): @via_usim async def test_run_job(self): + drone = DummyDrone() job = Job(resources={"walltime": 50}, used_resources={"walltime": 10}) assert float("inf") == job.waiting_time async with Scope() as scope: - scope.do(job.run()) + scope.do(job.run(drone)) assert 10 == time assert 0 == job.waiting_time assert job.successful From ec28229d01c185863f11dd757203c78769ec4596 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Nov 2019 13:11:28 +0100 Subject: [PATCH 310/648] added definitions for job, job generator, job queue, and scheduler --- docs/source/glossary.rst | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index 1e6ac30..8c5e529 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -18,15 +18,28 @@ Glossary of Terms disintegrate. Drone - Partitionable placeholder for jobs. In the current state of LAPIS a drone - represents a single worker node provided by a specific :term:`pool`. + Partitionable placeholder for :term:`jobs `. In the current state of + LAPIS a drone represents a single worker node provided by a specific + :term:`pool`. .. note:: The concept of drones is introduced by TARDIS. Drones integrate themselves into an HTCondor overlay batch system and thereby provision - the resources for jobs. They act nearly autonomously to e.g. manage - shutdown and error handling if required. + the resources for :term:`jobs `. They act nearly autonomously to + e.g. manage shutdown and error handling if required. + + Job + A task that requires a defined collection of resources to be successfully + processed. 
Processing of jobs is done by :term:`drones `. + + Job Generator + The Job Generator takes care to continuously create :term:`jobs ` + that are appended to a central :term:`Job Queue` based on job information + provided by one or several job input files. + + Job Queue + Wait queue that contains the :term:`jobs ` in order of creation time. Opportunistic Resource Any resources not permanently dedicated to but temporarily available for @@ -38,6 +51,11 @@ Glossary of Terms available resources e.g. number of cores, memory, or disk. A resource provider can provide a number of pools. + Scheduler + An autonomous process that assigns :term:`jobs ` for execution from + the :term:`Job Queue` to any appropriate :term:`drone`. The process + of job-to-drone-assignments builds on a specified matchmaking logic. + Utilisation Information about the effectiveness of use of resources acquired by a :term:`drone`. From f2fca94553cba29507fee34d0957f833cc425cdd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Nov 2019 13:12:24 +0100 Subject: [PATCH 311/648] added information about telegraf monitoring --- docs/source/topics/monitoring.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index a779774..b85d6ba 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -77,6 +77,11 @@ COBalD-specific Monitoring Telegraf -------- +LAPIS supports sending monitoring information to telegraf via the CLI option +``--log-telegraf``. The monitoring information for telegraf are sent to the +default UDP logging port ``logging.handlers.DEFAULT_UDP_LOGGING_PORT`` that is +port ``9021``. + Resource Status ~~~~~~~~~~~~~~~ From 8781d42c7c1adf720f470f351c5141a0f82f6309 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 25 Nov 2019 13:12:53 +0100 Subject: [PATCH 312/648] added placeholder section for caching-specific monitoring --- docs/source/topics/monitoring.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index b85d6ba..1143083 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -74,6 +74,13 @@ COBalD-specific Monitoring .. autofunction:: lapis.monitor.cobald.drone_statistics .. autofunction:: lapis.monitor.cobald.pool_statistics +Caching-specific Monitoring +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. TODO:: + + Will be added as soon as the caching branch is merged. 
+ Telegraf -------- From f58606229c811525949e58a3c9b77fb66f749068 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 12:22:18 +0100 Subject: [PATCH 313/648] added cobald as another entry for intersphinx_mapping --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index b43e6de..db8cc29 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -198,6 +198,7 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "usim": ("https://usim.readthedocs.io/en/stable", None), + "cobald": ("https://cobald.readthedocs.io/en/stable", None), } # -- Options for todo extension ---------------------------------------------- From b1b1d8f6240764fe71ef70e15e2e3398886f80cd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 12:23:16 +0100 Subject: [PATCH 314/648] updated glossary and added links to terms in glossary --- docs/index.rst | 22 +++++++++++----------- docs/source/glossary.rst | 22 ++++++++++++++++++++-- docs/source/topics/monitoring.rst | 2 +- docs/source/topics/support.rst | 30 +++++++++++++++--------------- 4 files changed, 47 insertions(+), 29 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 32e93a8..fb028a0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,14 +13,14 @@ LAPIS -- Simulations for Opportunistic Resources source/topics/overview source/glossary -The LAPIS simulator enables the simulation of job execution and scheduling with -a focus on :term:`opportunistic resources `. The +The LAPIS simulator enables the simulation of :term:`job` execution and scheduling +with a focus on :term:`opportunistic resources `. The scheduling internally builds on concepts from `HTCondor`_. The :term:`opportunistic resources ` are managed building on the projects `TARDIS`_ and `COBalD`_. The simulation builds on importing well-established input formats to generate -the jobs and set up the infrastructure either in an opportunistic or -classical fashion. +the :term:`jobs ` and set up the infrastructure either in an opportunistic +or classical fashion. Simple Command Line Interface ----------------------------- @@ -38,8 +38,8 @@ mode enabling you to compare the various simulation outputs. :title: Static The *static* environment provides a classical setup where all resources - are available exclusively for processing the jobs for the whole runtime - of the simulation. + are available exclusively for processing the :term:`jobs ` for the + whole runtime of the simulation. .. code-block:: bash @@ -50,8 +50,8 @@ mode enabling you to compare the various simulation outputs. :title: Dynamic The *dynamic* environment builds on volatile, opportunistic resources - exclusively. Based on the amount of jobs being processed within the - simulation COBalD controllers decide about the integration and + exclusively. Based on the amount of :term:`jobs ` being processed + within the simulation COBalD controllers decide about the integration and disintegration of resources. .. code-block:: bash @@ -65,7 +65,7 @@ mode enabling you to compare the various simulation outputs. The *hybrid* simulation environment provides a baseline of static resources that are available for the whole runtime of the simulation. These static resources are dynamically complemented with volatile, opportunistic - resources based on current job pressure. + resources based on current :term:`job` pressure. .. code-block:: bash @@ -73,14 +73,14 @@ mode enabling you to compare the various simulation outputs. 
--static-pool-file htcondor \ --dynamic-pool-file htcondor -As you can see from the example, you can even mix and match different input +As you can see from the example above, you can even mix and match different input formats to create your required simulation environment. An extensive documentation about the CLI can be found in the :doc:`source/topics/cli` chapter. Simple Framework for Advanced Use Cases --------------------------------------- -The implementation of the simulation itself builds on the lightweight simulation +The simulation is event-driven and builds on the lightweight simulation framework `μSim`_. Due to the human-centric API of μSim, it is a charm to actually read and extend the simulation for adaptation to various use cases. diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index 8c5e529..aecc32e 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -17,6 +17,19 @@ Glossary of Terms by COBalD to internally decide about which resources to integrate or disintegrate. + Controller + Manages the demand of :term:`drones ` for a given :term:`pool`. + The controller continuously evaluates the :term:`allocation` and + :term:`utilisation` of resources for available :term:`drones ` + for a given :term:`pool` and regulates the current demand to ensure that + best used :term:`drones ` are available via the overlay batch system. + + .. note:: + + Controllers are also initiated for :term:`static pools `. + Their functionality is different from those of opportunistic resources + by initialising the :term:`drones ` only once. + Drone Partitionable placeholder for :term:`jobs `. In the current state of LAPIS a drone represents a single worker node provided by a specific @@ -30,7 +43,7 @@ Glossary of Terms e.g. manage shutdown and error handling if required. Job - A task that requires a defined collection of resources to be successfully + A task that requires a defined set of resources to be successfully processed. Processing of jobs is done by :term:`drones `. Job Generator @@ -51,10 +64,15 @@ Glossary of Terms available resources e.g. number of cores, memory, or disk. A resource provider can provide a number of pools. + The simulation differentiates between static and dynamic pools. While + the specified number of :term:`drones ` is initialised once for + static pools, the demand for :term:`drones ` is continually updated + by a given :term:`controller` for dynamic pools. + Scheduler An autonomous process that assigns :term:`jobs ` for execution from the :term:`Job Queue` to any appropriate :term:`drone`. The process - of job-to-drone-assignments builds on a specified matchmaking logic. + of job-to-drone-assignment builds on a specified matchmaking logic. Utilisation Information about the effectiveness of use of resources acquired by a diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index 1143083..6651137 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -16,7 +16,7 @@ object. The monitoring itself runs asynchronously: Whenever elements become available in the monitoring queue, the logging process starts. If you want to define your own logging callable that for example logs information -about changes to a drone it should follow the following format: +about changes to a :term:`drone` it should follow the following format: .. 
code-block:: python3 diff --git a/docs/source/topics/support.rst b/docs/source/topics/support.rst index 52f08b7..8a16e16 100644 --- a/docs/source/topics/support.rst +++ b/docs/source/topics/support.rst @@ -15,11 +15,11 @@ HTCondor Job Imports ~~~~~~~~~~~ -Jobs can be created directly from HTCondor outputs. Via the ``condor_history`` -command from HTCondor, ClassAds describing a jobs requested and used resources -can be gathered and saved to a csv or json file. -To sufficiently describe a job for the simulation information about requested -and used resources should be included in the export: +:term:`Jobs ` can be created directly from HTCondor outputs. Via the +``condor_history`` command from HTCondor, ClassAds describing a :term:`jobs ` +requested and used resources can be gathered and saved to a csv or json file. +To sufficiently describe a :term:`job` for the simulation information about +requested and used resources should be included in the export: requested resources: RequestCpus, RequestWalltime, RequestMemory, RequestDisk @@ -30,18 +30,18 @@ used resources: additional job information: QDate -If csv is chosen as input file format every line represents a job, columns -should be separated by spaces, comments should be marked by simple quotation -marks. +If csv is chosen as input file format every line represents a :term:`job`, +columns should be separated by spaces, comments should be marked by simple +quotation marks. -If information about a jobs input files are passed to lapis a json file should -contain job descriptions because this file format allows for nested structures. -In this case the json file should contain an array of objects, each representing -a job. +If information about a :term:`jobs ` input files are passed to lapis a json +file should contain :term:`job` descriptions because this file format allows for +nested structures. In this case the json file should contain an array of objects, +each representing a :term:`job`. -ClassAds containing information about a jobs input files are not part of a jobs -standard ClassAds in HTCondor but can be extracted via external tools (e.g. job -submission tools) and stored as Inputfiles ClassAd. +ClassAds containing information about a :term:`jobs ` input files are not +part of a :term:`jobs ` standard ClassAds in HTCondor but can be extracted +via external tools (e.g. job submission tools) and stored as Inputfiles ClassAd. Alternatively this information can be added to the job input file manually. The ``Inputfile`` ClassAd contains dictionary with the input file names serving From 4dc8e3d30d2e68d9529b37f69fb2120ced9bc21e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 12:24:51 +0100 Subject: [PATCH 315/648] changed ordering of topics --- docs/source/topics/overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index f00d705..fcd80ff 100644 --- a/docs/source/topics/overview.rst +++ b/docs/source/topics/overview.rst @@ -7,6 +7,6 @@ This is a collection of separate topics on LAPIS. 
:maxdepth: 1 concept + monitoring cli support - monitoring From 10678c60c9fc20b261cfd4f04ef4f471f114fe43 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 12:25:21 +0100 Subject: [PATCH 316/648] added contents for overall description of relevant components in simulation --- docs/source/topics/concept.rst | 134 +++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/docs/source/topics/concept.rst b/docs/source/topics/concept.rst index e555972..b571a13 100644 --- a/docs/source/topics/concept.rst +++ b/docs/source/topics/concept.rst @@ -1,2 +1,136 @@ Simulation Concept ================== + +Background +---------- + +.. TODO:: + + HEP context. + +Components +---------- + +The core simulation builds on several components, and concepts: + +* :term:`Job Generator`, +* :term:`Job Queue`, +* :term:`Pools ` and their :term:`Controllers `, +* :term:`Drones `, and +* the :term:`Scheduler`, + +If you are planning to adapt the simulation for your specific use case, please +consider the different components to determine what and where to extend functionality. + +Job Generator +~~~~~~~~~~~~~ + +The Job Generator works on the job input files. It takes care to +translate time-based characteristics of the :term:`jobs ` into simulation +time. For this the timestamp of the first :term:`job` of each job input file is +taken as the ``base`` timestamp, resulting in a time value of ``0`` for the +first :term:`job`. All following :term:`jobs ` are adapted accordingly, +i.e. time is ``time - base``. + +The Job Generator itself acts as a generator, meaning that a :term:`job` is put +into the simulations :term:`Job Queue` as soon as the simulation time corresponds +to the translated :term:`job` queueing time. + +Job Queue +~~~~~~~~~ + +The Job Queue is filled with :term:`jobs ` in creation-time order by the +:term:`Job Generator`. The queue is managed by the :term:`scheduler` and contains +all :term:`jobs ` that are not yet scheduled to a :term:`drone` as well as +:term:`jobs ` that have not yet been processed succesfully. + +Pools +~~~~~ + +Pools are created based on the pool input files. Each pool is characterised by +a set of defined resources. Further, pools have a ``capacity`` number of +:term:`drones ` that can be created from a given pool. If the capacity +is not specified, a maximum capacity of ``float("inf")`` is assumed. + +For pools, we differentiate static and dynamic pools. While static pools are +intialised with a fixed amount of :term:`drones `, the number of +:term:`drones ` is adapted dynamically by the +:term:`pool controller ` for dynamic pools. + +.. autoclass:: lapis.pool.Pool +.. autoclass:: lapis.pool.StaticPool + +Controllers +~~~~~~~~~~~ + +Each :term:`pool` is started with an appropriate controller. Each controller runs +periodically to check :term:`allocation` and :term:`utilisation` of assigned +:term:`pool(s) ` to regulate the demand of :term:`drones ` for the +given :term:`pool`. + +The concept of controllers is introduced by COBalD. The controllers implemented +in LAPIS share the general concept as well as implementation by subclassing +provided controllers such as :py:class:`cobald.controller.linear.LinearController` +or :py:class:`cobald.controller.relative_supply.RelativeSupplyController` and +overwriting :py:meth:`lapis.controller.SimulatedLinearController.run`. In +this way, we enable validation of current TARDIS/COBalD setup as well as simulation +of future extensions. + +Available controller implementations from COBalD in LAPIS are: + +.. 
autoclass:: lapis.controller.SimulatedLinearController + :members: + +.. autoclass:: lapis.controller.SimulatedRelativeSupplyController + :members: + +And there is also an implementation considered as an extension for COBalD: + +.. autoclass:: lapis.controller.SimulatedCostController + :members: + +Drones +~~~~~~ + +Drones provide instances of the set of resources defined by a given :term:`pool`. +Drones are the only objects in the simulation that are able to process +:term:`jobs `. Simplified, drones represent worker nodes. + +The concept of drones is introduced by TARDIS. A drone is a generalisation of +the pilot concept used for example in High Energy Physics and is a placeholder +for the real workloads to be processed. A drone is expected to autonomously +manage its lifecycle, meaning, that it handles failures and termination +independently from other components within the system. + +.. warning:: + + Drones are not yet fully employed in LAPIS. They already run independently + but do not handle termination themselves. + +Scheduler +~~~~~~~~~ + +The scheduler is the connecting component between the :term:`jobs ` in the +:term:`job queue` and the running :term:`drones `. It does the matchmaking +between :term:`jobs ` and :term:`drones ` to assign the +:term:`jobs ` to the best evaluated :term:`drone`. Whenever a :term:`job` +is assigned to a :term:`drone`, the :term:`job` is removed from the +:term:`job queue`. The scheduler is notified as soon as the :term:`job` is +terminated independent from the state of termination. It is the task of the +scheduler to decide to either remove the :term:`job` from the simulation in case +of success or to re-insert the :term:`job` into the :term:`job queue` to retry +processing. + +LAPIS currently supports an HTCondor-like implementation of a scheduler: + +.. autoclass:: lapis.scheduler.CondorJobScheduler + :members: + +.. warning:: + + The implementation of the HTCondor scheduler is still very rough. + The matchmaking currently does not rely on given ``requirements``, but only + considers required and provided ``resources`` for :term:`jobs ` and + :term:`drones `. The automatic clustering, therefore, also only relies + on the type and number of ``resources`` and is applied to :term:`drones ` + only at the moment. From b07e629030acad07fab1c747dc2d98a879542548 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 13:47:07 +0100 Subject: [PATCH 317/648] Apply suggestions from code review Co-Authored-By: Max Fischer --- docs/index.rst | 4 ++-- docs/source/glossary.rst | 2 +- docs/source/topics/concept.rst | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index fb028a0..2893515 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,8 +29,8 @@ Although LAPIS is written to provide an extensive framework for setting up advanced simulation, it also provides a simple command line interface to get you started quickly. -You have the options to start in 1) static, 2) dynamic as well as 3) hybrid -mode enabling you to compare the various simulation outputs. +You have the options to start in 1) static, 2) dynamic, or 3) hybrid +mode enabling you to compare the various simulation scenarios. .. content-tabs:: diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index aecc32e..54cbef7 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -14,7 +14,7 @@ Glossary of Terms .. 
note:: The general concept of allocation and :term:`utilisation` is introduced - by COBalD to internally decide about which resources to integrate or + by COBalD to internally decide which resources to integrate or disintegrate. Controller diff --git a/docs/source/topics/concept.rst b/docs/source/topics/concept.rst index b571a13..3e61b75 100644 --- a/docs/source/topics/concept.rst +++ b/docs/source/topics/concept.rst @@ -25,7 +25,7 @@ consider the different components to determine what and where to extend function Job Generator ~~~~~~~~~~~~~ -The Job Generator works on the job input files. It takes care to +The Job Generator processes any job input files. It takes care to translate time-based characteristics of the :term:`jobs ` into simulation time. For this the timestamp of the first :term:`job` of each job input file is taken as the ``base`` timestamp, resulting in a time value of ``0`` for the From 8bac6a8a8358f06ddd36da2bf805d1cee25124bf Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 14:28:17 +0100 Subject: [PATCH 318/648] removed unnecessary information about job input files that are outdated already --- docs/source/topics/support.rst | 76 +++++----------------------------- 1 file changed, 11 insertions(+), 65 deletions(-) diff --git a/docs/source/topics/support.rst b/docs/source/topics/support.rst index 8a16e16..fd35b99 100644 --- a/docs/source/topics/support.rst +++ b/docs/source/topics/support.rst @@ -17,7 +17,7 @@ Job Imports :term:`Jobs ` can be created directly from HTCondor outputs. Via the ``condor_history`` command from HTCondor, ClassAds describing a :term:`jobs ` -requested and used resources can be gathered and saved to a csv or json file. +requested and used resources can be gathered and saved to a csv file. To sufficiently describe a :term:`job` for the simulation information about requested and used resources should be included in the export: @@ -28,74 +28,20 @@ used resources: RemoteWallClockTime, MemoryUsage, DiskUsage_RAW, RemoteSysCpu, RemoteUserCpu additional job information: - QDate + QDate, GlobalJobId -If csv is chosen as input file format every line represents a :term:`job`, -columns should be separated by spaces, comments should be marked by simple -quotation marks. +In the csv file format every line represents a :term:`job`. The columns are +separated by spaces, and comments are marked by simple quotation marks. -If information about a :term:`jobs ` input files are passed to lapis a json -file should contain :term:`job` descriptions because this file format allows for -nested structures. In this case the json file should contain an array of objects, -each representing a :term:`job`. +.. note:: -ClassAds containing information about a :term:`jobs ` input files are not -part of a :term:`jobs ` standard ClassAds in HTCondor but can be extracted -via external tools (e.g. job submission tools) and stored as Inputfiles ClassAd. -Alternatively this information can be added to the job input file manually. + If information about the input files of a :term:`jobs ` should be passed + to LAPIS, a separate csv file is required. This feature is not provided yet, + but will be added in one of the next versions. -The ``Inputfile`` ClassAd contains dictionary with the input file names serving -as keys and subdictionaries further describing the input files. -These subdictionaries provide - -filesize: - the files total size in MB - -usedsize: - the amount of data the job actually reads from this file in MB - -.. code-block:: csv - - TODO - -.. 
code-block:: json - - [ - { - "QDate": 1567169672, - "RequestCpus": 1, - "RequestWalltime": 60, - "RequestMemory": 2000, - "RequestDisk": 6000000, - "RemoteWallClockTime": 100.0, - "MemoryUsage": 2867, - "DiskUsage_RAW": 41898, - "RemoteSysCpu": 10.0, - "RemoteUserCpu": 40.0, - }, - { - "QDate": 1567169672, - "RequestCpus": 1, - "RequestWalltime": 60, - "RequestMemory": 2000, - "RequestDisk": 6000000, - "RemoteWallClockTime": 100.0, - "MemoryUsage": 2867, - "DiskUsage_RAW": 41898, - "RemoteSysCpu": 10.0, - "RemoteUserCpu": 40.0, - "Inputfiles": { - "a.root": { - "filesize": 25000, - "usedsize": 20000 - }, - "b.root": { - "filesize": 25000, - "usedsize": 20000 - } - } - } - ] +Input file information of jobs are not part of the standard :term:`jobs ` +ClassAds in HTCondor but can be extracted via external tools (e.g. job submission +tools). SWF Format ---------- From e967891ddb80cbf791d986ff0dd5e4ab4a06f0a8 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 26 Nov 2019 15:41:56 +0100 Subject: [PATCH 319/648] Update docs/source/topics/concept.rst Co-Authored-By: Eileen Kuehn --- docs/source/topics/concept.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/topics/concept.rst b/docs/source/topics/concept.rst index 3e61b75..1e17a2d 100644 --- a/docs/source/topics/concept.rst +++ b/docs/source/topics/concept.rst @@ -63,7 +63,7 @@ intialised with a fixed amount of :term:`drones `, the number of Controllers ~~~~~~~~~~~ -Each :term:`pool` is started with an appropriate controller. Each controller runs +Each :term:`pool` is managed by a controller. Each controller runs periodically to check :term:`allocation` and :term:`utilisation` of assigned :term:`pool(s) ` to regulate the demand of :term:`drones ` for the given :term:`pool`. 
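The controller pattern described above can be made concrete with a short sketch. This is a minimal, hypothetical example and not the actual ``lapis.controller.SimulatedLinearController`` implementation: it assumes that ``cobald.controller.linear.LinearController`` accepts the target pool as its first argument and exposes a ``regulate()`` step, and it reuses usim's ``interval()`` helper, which the LAPIS scheduler already uses for its own periodic cycle.

.. code-block:: python3

    # Hedged sketch of a simulated controller: periodically re-evaluate
    # allocation and utilisation of the target pool and let COBalD's
    # regulate() step adjust the demand for drones. Class and parameter
    # names are illustrative, not taken from lapis.controller.
    from usim import interval
    from cobald.controller.linear import LinearController


    class SketchedLinearController(LinearController):
        def __init__(self, target, period=1, **kwargs):
            super().__init__(target, **kwargs)
            self._period = period

        async def run(self):
            # drive COBalD's regulation with the simulation clock instead
            # of the COBalD daemon's own scheduling
            async for _ in interval(self._period):
                self.regulate(self._period)

Such a controller is attached per pool, which matches the simulator constructing controllers as ``controller(target=pool, rate=1)`` in ``create_pools``.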
From c80321eccac2b69e9295d094da5365d16c22638e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 26 Nov 2019 16:11:04 +0100 Subject: [PATCH 320/648] added requirement to change-log and formatted file --- pyproject.toml | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5dc1fbd..3217107 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,23 +12,19 @@ description-file = "README.rst" keywords = "htcondor simulation python cobald tardis opportunistic scheduling scheduler" classifiers = [ "License :: OSI Approved :: MIT License", - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Information Technology', - 'Intended Audience :: Science/Research', - 'Intended Audience :: System Administrators', - 'Topic :: Adaptive Technologies', - 'Topic :: Office/Business :: Scheduling', - 'Topic :: System :: Distributed Computing', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7' -] -requires = [ - "cobald", - "usim == 0.4", - "click", + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "Topic :: Adaptive Technologies", + "Topic :: Office/Business :: Scheduling", + "Topic :: System :: Distributed Computing", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", ] +requires = ["cobald", "usim == 0.4", "click"] [tool.flit.metadata.requires-extra] test = [ @@ -37,7 +33,13 @@ test = [ "flake8-bugbear", "black; implementation_name=='cpython'", ] -doc = ["sphinx", "sphinx_rtd_theme", "sphinxcontrib-contentui", "sphinx-click"] +doc = [ + "sphinx", + "sphinx_rtd_theme", + "sphinxcontrib-contentui", + "sphinx-click", + "change-log", +] dev = ["pre-commit"] [tool.flit.metadata.urls] From d4c82a6203d28b4ac8c04a4dd66348419f7edf25 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 15:56:45 +0100 Subject: [PATCH 321/648] added changelog to toc --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index 2893515..5124ee9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,7 @@ LAPIS -- Simulations for Opportunistic Resources source/topics/overview source/glossary + source/changelog The LAPIS simulator enables the simulation of :term:`job` execution and scheduling with a focus on :term:`opportunistic resources `. 
The From cb6197937c5cd9f6fab44a0994dfa5a5c20da35d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 15:57:57 +0100 Subject: [PATCH 322/648] added overview of versions --- docs/source/changes/versions.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/changes/versions.yaml diff --git a/docs/source/changes/versions.yaml b/docs/source/changes/versions.yaml new file mode 100644 index 0000000..904ba40 --- /dev/null +++ b/docs/source/changes/versions.yaml @@ -0,0 +1,6 @@ +- semver: 0.3.0 + date: '2019-10-27' +- semver: 0.2.0 + date: '2019-10-25' +- semver: 0.1.1 + date: '2019-10-24' From 674eddcb432943990963c49a457c8234d89fe3bb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:02:21 +0100 Subject: [PATCH 323/648] added information for changes of version v0.1.1 --- docs/source/changes/10.logging_extension.yaml | 12 ++++++++++++ .../source/changes/19.distribution_and_license.yaml | 11 +++++++++++ docs/source/changes/23.usim_api_adaptations.yaml | 9 +++++++++ docs/source/changes/25.stopping_job_generator.yaml | 8 ++++++++ docs/source/changes/26.terminate_simulation.yaml | 12 ++++++++++++ docs/source/changes/28.cleanup.yaml | 8 ++++++++ docs/source/changes/29.resource_ratio.yaml | 10 ++++++++++ docs/source/changes/3.flake8.yaml | 8 ++++++++ docs/source/changes/6.swf_import_corrections.yaml | 13 +++++++++++++ 9 files changed, 91 insertions(+) create mode 100644 docs/source/changes/10.logging_extension.yaml create mode 100644 docs/source/changes/19.distribution_and_license.yaml create mode 100644 docs/source/changes/23.usim_api_adaptations.yaml create mode 100644 docs/source/changes/25.stopping_job_generator.yaml create mode 100644 docs/source/changes/26.terminate_simulation.yaml create mode 100644 docs/source/changes/28.cleanup.yaml create mode 100644 docs/source/changes/29.resource_ratio.yaml create mode 100644 docs/source/changes/3.flake8.yaml create mode 100644 docs/source/changes/6.swf_import_corrections.yaml diff --git a/docs/source/changes/10.logging_extension.yaml b/docs/source/changes/10.logging_extension.yaml new file mode 100644 index 0000000..29c94ed --- /dev/null +++ b/docs/source/changes/10.logging_extension.yaml @@ -0,0 +1,12 @@ +category: changed +summary: "Extension of logging" +description: | + The logging of predefined logging functions now follows a specified database + structure. The structure is documented in the documentation of the package. + Further, the logging was extended to additionally support logging to telegraf + using the python default UDP logging port and the LineProtocolFormat of telegraf. + The timestamps of the log messages follow the simulation time. +pull requests: + - 10 + - 14 +version: 0.1.1 diff --git a/docs/source/changes/19.distribution_and_license.yaml b/docs/source/changes/19.distribution_and_license.yaml new file mode 100644 index 0000000..5f3d9eb --- /dev/null +++ b/docs/source/changes/19.distribution_and_license.yaml @@ -0,0 +1,11 @@ +category: changed +summary: "Distribution setup and license information" +description: | + Usim is a new requirement for installing the package. Further, the distribution + process now uses flit and, therefore, setup.py was replaced by pyproject.toml. + Finally, the license file was adapted to also include Max as an author. 
+pull requests: + - 19 + - 33 + - 30 +version: 0.1.1 diff --git a/docs/source/changes/23.usim_api_adaptations.yaml b/docs/source/changes/23.usim_api_adaptations.yaml new file mode 100644 index 0000000..f1c212e --- /dev/null +++ b/docs/source/changes/23.usim_api_adaptations.yaml @@ -0,0 +1,9 @@ +category: changed +summary: "Support of current API of usim" +description: | + In preparation to support the usim features of borrowing and claiming resources + guaranteeing synchronisation of available resources in drones, the implementation was + adapted to meet the current requireements of usim v0.3. +pull requests: + - 23 +version: 0.1.1 diff --git a/docs/source/changes/25.stopping_job_generator.yaml b/docs/source/changes/25.stopping_job_generator.yaml new file mode 100644 index 0000000..a7d5608 --- /dev/null +++ b/docs/source/changes/25.stopping_job_generator.yaml @@ -0,0 +1,8 @@ +category: fixed +summary: "StopIteration handling by Job Generator" +description: | + The Job Generator so far did not properly handle StopIterations while importing + jobs. This has been fixed. +pull requests: + - 25 +version: 0.1.1 diff --git a/docs/source/changes/26.terminate_simulation.yaml b/docs/source/changes/26.terminate_simulation.yaml new file mode 100644 index 0000000..4ca1327 --- /dev/null +++ b/docs/source/changes/26.terminate_simulation.yaml @@ -0,0 +1,12 @@ +category: fixed +summary: "Termination of simulation" +description: | + The scheduler so far did not have any information about when the simulation + was expected to terminate. Therefore, a new property `_collecting` was introduced + in the simlulator. This property is bound to the job queue. As soon as the + job generator does not produce any new jobs the job queue is closed and after + all jobs were removed, the property `_collecting` is set to `False` to + initiate termination of simulation. +pull requests: + - 26 +version: 0.1.1 diff --git a/docs/source/changes/28.cleanup.yaml b/docs/source/changes/28.cleanup.yaml new file mode 100644 index 0000000..7611c72 --- /dev/null +++ b/docs/source/changes/28.cleanup.yaml @@ -0,0 +1,8 @@ +category: changed +summary: "Cleanup and improvements of existing code" +description: | + usim related code was simplified and unused code was removed. +pull requests: + - 28 + - 32 +version: 0.1.1 diff --git a/docs/source/changes/29.resource_ratio.yaml b/docs/source/changes/29.resource_ratio.yaml new file mode 100644 index 0000000..8b394dc --- /dev/null +++ b/docs/source/changes/29.resource_ratio.yaml @@ -0,0 +1,10 @@ +category: fixed +summary: "Calculation of used and requested resource ratio" +description: | + The introduction of the borrowing and claiming concept of resources provided + by usim changed the way resources were handled internally. Therefore, the + calculation of used and requested resource ratios had to be adapted to not + result in wrong results. +pull requests: + - 29 +version: 0.1.1 diff --git a/docs/source/changes/3.flake8.yaml b/docs/source/changes/3.flake8.yaml new file mode 100644 index 0000000..a2a16fa --- /dev/null +++ b/docs/source/changes/3.flake8.yaml @@ -0,0 +1,8 @@ +category: added +summary: "Requirement for flake8" +description: | + Flake8 was added as a requirement for continuous integration. Further, all + issues with flake8 were fixed. 
+pull requests: + - 3 +version: 0.1.1 diff --git a/docs/source/changes/6.swf_import_corrections.yaml b/docs/source/changes/6.swf_import_corrections.yaml new file mode 100644 index 0000000..c6a8d4a --- /dev/null +++ b/docs/source/changes/6.swf_import_corrections.yaml @@ -0,0 +1,13 @@ +category: fixed +summary: "Importing of SWF files" +description: | + SWF imports did not properly do the unit conversion but instead accessed a + wrong dictionary to get the conversion information. Now they use the correct + dictionary. + Further, SWF defines values of ``-1`` as a default for user values that have + not been specified. However, negative values of resources in LAPIS are not + supported. Therefore, negative values now default to ``0`` in import process.ß +pull requests: + - 6 + - 22 +version: 0.1.1 From ca7aa49eb5101a00ab4ce16ceada85f76ff06c8b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:03:11 +0100 Subject: [PATCH 324/648] added information for changes of version v0.2.0 --- docs/source/changes/36.project_renamed.yaml | 8 ++++++++ docs/source/changes/37.usim_api_adaptation.yaml | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 docs/source/changes/36.project_renamed.yaml create mode 100644 docs/source/changes/37.usim_api_adaptation.yaml diff --git a/docs/source/changes/36.project_renamed.yaml b/docs/source/changes/36.project_renamed.yaml new file mode 100644 index 0000000..1684f8c --- /dev/null +++ b/docs/source/changes/36.project_renamed.yaml @@ -0,0 +1,8 @@ +category: changed +summary: "Rename from lapis to lapis-sim for pypi and rtd" +description: | + The name *lapis* in pypi was already taken so we had to change the distribution + name and decided to go for *lapis-sim*. +pull requests: + - 36 +version: 0.2.0 diff --git a/docs/source/changes/37.usim_api_adaptation.yaml b/docs/source/changes/37.usim_api_adaptation.yaml new file mode 100644 index 0000000..fa65b4b --- /dev/null +++ b/docs/source/changes/37.usim_api_adaptation.yaml @@ -0,0 +1,8 @@ +category: changed +summary: "Support of current API of usim" +description: | + In preparation to support upcomming features of usim, the current usage had to + be adapted to meet the current requireements of usim v0.4. +pull requests: + - 37 +version: 0.2.0 From 45543b4c99c496e468df0fed5293f661d63c24bb Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:04:56 +0100 Subject: [PATCH 325/648] added information for changes of version v0.3.0 --- docs/source/changes/44.terminate_simulation.yaml | 13 +++++++++++++ docs/source/changes/45.logging_improvements.yaml | 14 ++++++++++++++ .../46.resource_usage_in_scheduling_cycle.yaml | 12 ++++++++++++ docs/source/changes/47.pre_commit_hooks.yaml | 8 ++++++++ 4 files changed, 47 insertions(+) create mode 100644 docs/source/changes/44.terminate_simulation.yaml create mode 100644 docs/source/changes/45.logging_improvements.yaml create mode 100644 docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml create mode 100644 docs/source/changes/47.pre_commit_hooks.yaml diff --git a/docs/source/changes/44.terminate_simulation.yaml b/docs/source/changes/44.terminate_simulation.yaml new file mode 100644 index 0000000..1f60138 --- /dev/null +++ b/docs/source/changes/44.terminate_simulation.yaml @@ -0,0 +1,13 @@ +category: fixed +summary: "Proper termination of simulation" +description: | + usim defines non-volatile and volatile running of async tasks within scopes. 
+ Volatile tasks can be finished by the outer scope when the outer scope + terminates while non-volatile tasks must be finished before leaving the scope. + So far, all tasks were run as non-volatile in lapis resulting in simulations + that never finished as ``pools``, and ``controllers`` for example were running + forever. This is fixed now by starting those tasks as volatile when running + a simulation. +pull requests: + - 44 +version: 0.3.0 diff --git a/docs/source/changes/45.logging_improvements.yaml b/docs/source/changes/45.logging_improvements.yaml new file mode 100644 index 0000000..e35f12b --- /dev/null +++ b/docs/source/changes/45.logging_improvements.yaml @@ -0,0 +1,14 @@ +category: changed +summary: "Object-based logging and logging for job events" +description: | + So far the logging during simulation was flag-based. A soon as the logging flag + was set to ``True``, the logging process started. This created a lot of overhead. + The logging now is object-based, meaning that only objects with relevant changes + for logging are added to a logging queue. Each of those objects in the logging + queue are processed by registered logging functions. + + Further, now also the logging of job events is supported through this global + logging process. +pull requests: + - 45 +version: 0.3.0 diff --git a/docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml b/docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml new file mode 100644 index 0000000..2103374 --- /dev/null +++ b/docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml @@ -0,0 +1,12 @@ +category: fixed +summary: "Update of available resources during scheduling cycle" +description: | + Until now jobs took care on updating available resources after a job was + started resulting in an adaption of the auto clustering in the scheduler. + As the starting of jobs took longer than the scheduling within one scheduling + cycle another job could be assigned although the resources were gone already. + This is fixed now by temporarily assuming resource allocation after a job was + sent to a drone within the scheduler itself. +pull requests: + - 46 +version: 0.3.0 diff --git a/docs/source/changes/47.pre_commit_hooks.yaml b/docs/source/changes/47.pre_commit_hooks.yaml new file mode 100644 index 0000000..42c3eff --- /dev/null +++ b/docs/source/changes/47.pre_commit_hooks.yaml @@ -0,0 +1,8 @@ +category: added +summary: "Pre-commit hooks" +description: | + LAPIS now defines some pre-commit hooks including the execution of black for + proper formatting of source code. All files have, therefore, also been blackened. 
+pull requests: + - 47 +version: 0.3.0 From f9337a71b45fdf288a71590ca7783fdc54f5e69f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:06:53 +0100 Subject: [PATCH 326/648] added information for changes for next version --- docs/source/changes/27.documentation.yaml | 8 ++++++++ .../changes/49.htcondor_import_corrections.yaml | 7 +++++++ docs/source/changes/50.black_usage.yaml | 7 +++++++ docs/source/changes/51.job_input_files.yaml | 7 +++++++ docs/source/changes/59.unit_standardisation.yaml | 7 +++++++ docs/source/changes/63.execution_of_jobs.yaml | 11 +++++++++++ docs/source/changes/66.terminate_simulation.yaml | 9 +++++++++ docs/source/changes/69.job_drone_requirement.yaml | 7 +++++++ 8 files changed, 63 insertions(+) create mode 100644 docs/source/changes/27.documentation.yaml create mode 100644 docs/source/changes/49.htcondor_import_corrections.yaml create mode 100644 docs/source/changes/50.black_usage.yaml create mode 100644 docs/source/changes/51.job_input_files.yaml create mode 100644 docs/source/changes/59.unit_standardisation.yaml create mode 100644 docs/source/changes/63.execution_of_jobs.yaml create mode 100644 docs/source/changes/66.terminate_simulation.yaml create mode 100644 docs/source/changes/69.job_drone_requirement.yaml diff --git a/docs/source/changes/27.documentation.yaml b/docs/source/changes/27.documentation.yaml new file mode 100644 index 0000000..fb48fb5 --- /dev/null +++ b/docs/source/changes/27.documentation.yaml @@ -0,0 +1,8 @@ +category: added +summary: "Basic documentation" +description: | + LAPIS now includes a basic documentation about the different components and + concepts, importing jobs and pools, the logging process and database structure + as well as the command line interface. +pull requests: + - 27 diff --git a/docs/source/changes/49.htcondor_import_corrections.yaml b/docs/source/changes/49.htcondor_import_corrections.yaml new file mode 100644 index 0000000..ccce8cd --- /dev/null +++ b/docs/source/changes/49.htcondor_import_corrections.yaml @@ -0,0 +1,7 @@ +category: fixed +summary: "Importing of HTCondor jobs" +description: | + The unit conversion for some of the values from HTCondor jobs did not work + properly as values were overwritten. This is fixed now. +pull requests: + - 49 diff --git a/docs/source/changes/50.black_usage.yaml b/docs/source/changes/50.black_usage.yaml new file mode 100644 index 0000000..a766b64 --- /dev/null +++ b/docs/source/changes/50.black_usage.yaml @@ -0,0 +1,7 @@ +category: fixed +summary: "Handling of black for pypy" +description: | + Black does not work when running CI with pypy. The usage of black with pypy + therefore has been removed now. +pull requests: + - 50 diff --git a/docs/source/changes/51.job_input_files.yaml b/docs/source/changes/51.job_input_files.yaml new file mode 100644 index 0000000..9df0ef3 --- /dev/null +++ b/docs/source/changes/51.job_input_files.yaml @@ -0,0 +1,7 @@ +category: added +summary: "Information about input files for jobs" +description: | + In preparation for enabling caching and its effects within the simulation the + support of input files for jobs has been added. +pull requests: + - 51 diff --git a/docs/source/changes/59.unit_standardisation.yaml b/docs/source/changes/59.unit_standardisation.yaml new file mode 100644 index 0000000..53adaa0 --- /dev/null +++ b/docs/source/changes/59.unit_standardisation.yaml @@ -0,0 +1,7 @@ +category: changed +summary: "Standardisation of units" +description: | + The usage of units was not standardised so far. 
We now introduced to solely work + with Bytes and use ``int`` for representation throughout the simulation. +pull requests: + - 59 diff --git a/docs/source/changes/63.execution_of_jobs.yaml b/docs/source/changes/63.execution_of_jobs.yaml new file mode 100644 index 0000000..49eb3b8 --- /dev/null +++ b/docs/source/changes/63.execution_of_jobs.yaml @@ -0,0 +1,11 @@ +category: fixed +summary: "Jobs execution within drones" +description: | + Until now jobs have been started implictly by the scheduler within its scheduling + cycle. This created some issues for asynchronous tasks within jobs as those were + bound to the duration of the scheduling cycle. Therefore, we introduced a new + queue within drones that receive the jobs from the scheduler. Further, the drone + now takes care to properly start the job so that it runs independently now in + the scope of the drone. +pull requests: + - 63 diff --git a/docs/source/changes/66.terminate_simulation.yaml b/docs/source/changes/66.terminate_simulation.yaml new file mode 100644 index 0000000..32d929c --- /dev/null +++ b/docs/source/changes/66.terminate_simulation.yaml @@ -0,0 +1,9 @@ +category: fixed +summary: "Proper termination of simulation" +description: | + The simulation so far only awaited that the job queue became empty. This however + did not ensure that the jobs also properly finished. Therefore, the scheduler + now has a new property to store the number of running jobs. So the termination + of the scheduler is now additionally bound to this counter getting ``0``. +pull requests: + - 66 diff --git a/docs/source/changes/69.job_drone_requirement.yaml b/docs/source/changes/69.job_drone_requirement.yaml new file mode 100644 index 0000000..26823aa --- /dev/null +++ b/docs/source/changes/69.job_drone_requirement.yaml @@ -0,0 +1,7 @@ +category: added +summary: "Drone as a requirement to run a job" +description: | + Until now the run method of jobs did not require any parameter. This is changed + now and relies on a parameter for the executing drone. +pull requests: + - 69 From bb0aa40fef85fec8f2fcf99c2df504b152b3bbc5 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:07:19 +0100 Subject: [PATCH 327/648] added generated changelog --- docs/source/changelog.rst | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 docs/source/changelog.rst diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst new file mode 100644 index 0000000..c59ff9c --- /dev/null +++ b/docs/source/changelog.rst @@ -0,0 +1,63 @@ +.. 
Created by log.py at 2019-11-27, command + '/Users/eileenwork/development/work/lapis/venv/lib/python3.7/site-packages/change/__main__.py log docs/source/changes compile --output docs/source/changelog.rst' + based on the format of 'https://keepachangelog.com/' +######### +ChangeLog +######### + +Upcoming +======== + +Version [Unreleased] - 2019-11-27 ++++++++++++++++++++++++++++++++++ + +* **[Added]** Basic documentation +* **[Added]** Information about input files for jobs +* **[Added]** Drone as a requirement to run a job + +* **[Changed]** Standardisation of units + +* **[Fixed]** Handling of black for pypy +* **[Fixed]** Proper termination of simulation +* **[Fixed]** Jobs execution within drones +* **[Fixed]** Importing of HTCondor jobs + +0.3 Series +========== + +Version [0.3.0] - 2019-10-27 +++++++++++++++++++++++++++++ + +* **[Added]** Pre-commit hooks + +* **[Changed]** Object-based logging and logging for job events + +* **[Fixed]** Proper termination of simulation +* **[Fixed]** Update of available resources during scheduling cycle + +0.2 Series +========== + +Version [0.2.0] - 2019-10-25 +++++++++++++++++++++++++++++ + +* **[Changed]** Support of current API of usim +* **[Changed]** Rename from lapis to lapis-sim for pypi and rtd + +0.1 Series +========== + +Version [0.1.1] - 2019-10-24 +++++++++++++++++++++++++++++ + +* **[Added]** Requirement for flake8 + +* **[Changed]** Support of current API of usim +* **[Changed]** Distribution setup and license information +* **[Changed]** Cleanup and improvements of existing code +* **[Changed]** Extension of logging + +* **[Fixed]** Termination of simulation +* **[Fixed]** Calculation of used and requested resource ratio +* **[Fixed]** StopIteration handling by Job Generator +* **[Fixed]** Importing of SWF files From 698af6bfb01a81a2bb31bc38dec51ca57ba0f5de Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:10:15 +0100 Subject: [PATCH 328/648] added change for current PR --- docs/source/changelog.rst | 1 + docs/source/changes/68.changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 docs/source/changes/68.changelog.yaml diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c59ff9c..35bf30a 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -11,6 +11,7 @@ Upcoming Version [Unreleased] - 2019-11-27 +++++++++++++++++++++++++++++++++ +* **[Added]** Changelog * **[Added]** Basic documentation * **[Added]** Information about input files for jobs * **[Added]** Drone as a requirement to run a job diff --git a/docs/source/changes/68.changelog.yaml b/docs/source/changes/68.changelog.yaml new file mode 100644 index 0000000..9b0e67b --- /dev/null +++ b/docs/source/changes/68.changelog.yaml @@ -0,0 +1,6 @@ +category: added +summary: "Changelog" +description: | + The documentation now includes a changelog up to the current version. 
+pull requests: + - 68 From f538a467f671b36cc583d5b02e9f10a2ad267942 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 19:52:47 +0100 Subject: [PATCH 329/648] corrected pull request reference in change fragment --- docs/source/changelog.rst | 2 +- docs/source/changes/{68.changelog.yaml => 74.changelog.yaml} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename docs/source/changes/{68.changelog.yaml => 74.changelog.yaml} (95%) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 35bf30a..c60e614 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -11,8 +11,8 @@ Upcoming Version [Unreleased] - 2019-11-27 +++++++++++++++++++++++++++++++++ -* **[Added]** Changelog * **[Added]** Basic documentation +* **[Added]** Changelog * **[Added]** Information about input files for jobs * **[Added]** Drone as a requirement to run a job diff --git a/docs/source/changes/68.changelog.yaml b/docs/source/changes/74.changelog.yaml similarity index 95% rename from docs/source/changes/68.changelog.yaml rename to docs/source/changes/74.changelog.yaml index 9b0e67b..1fa7889 100644 --- a/docs/source/changes/68.changelog.yaml +++ b/docs/source/changes/74.changelog.yaml @@ -3,4 +3,4 @@ summary: "Changelog" description: | The documentation now includes a changelog up to the current version. pull requests: - - 68 + - 74 From 4792d01a6a9e6ac56d0e55ed94687b728c0b98c3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 17:26:07 +0100 Subject: [PATCH 330/648] fixed that drones could be registered twice --- lapis/drone.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 0e6845a..1932537 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -36,11 +36,7 @@ def __init__( else: self._valid_resource_keys = self.pool_resources.keys() self.scheduling_duration = scheduling_duration - if scheduling_duration == 0: - self._supply = 1 - self.scheduler.register_drone(self) - else: - self._supply = 0 + self._supply = 0 self.jobs = 0 self._allocation = None self._utilisation = None From 71bf0e39926eb719fcc53959e922da915f446fb1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 17:30:42 +0100 Subject: [PATCH 331/648] added change fragment for pull request --- docs/source/changelog.rst | 5 +++-- docs/source/changes/76.register_drones.yaml | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 docs/source/changes/76.register_drones.yaml diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c60e614..44b9793 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by log.py at 2019-11-27, command +.. 
Created by log.py at 2019-11-29, command '/Users/eileenwork/development/work/lapis/venv/lib/python3.7/site-packages/change/__main__.py log docs/source/changes compile --output docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' ######### @@ -8,7 +8,7 @@ ChangeLog Upcoming ======== -Version [Unreleased] - 2019-11-27 +Version [Unreleased] - 2019-11-29 +++++++++++++++++++++++++++++++++ * **[Added]** Basic documentation @@ -18,6 +18,7 @@ Version [Unreleased] - 2019-11-27 * **[Changed]** Standardisation of units +* **[Fixed]** Registering of drones * **[Fixed]** Handling of black for pypy * **[Fixed]** Proper termination of simulation * **[Fixed]** Jobs execution within drones diff --git a/docs/source/changes/76.register_drones.yaml b/docs/source/changes/76.register_drones.yaml new file mode 100644 index 0000000..69a88f0 --- /dev/null +++ b/docs/source/changes/76.register_drones.yaml @@ -0,0 +1,7 @@ +category: fixed +summary: "Registering of drones" +description: | + At startup drones were registered twice at the scheduler as the method + ``register_drone`` was called during initialisation and in ``run``. +pull requests: + - 76 From fce90a88d34fc9592f26be71b0c994f0288d2bdd Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Fri, 29 Nov 2019 18:01:54 +0100 Subject: [PATCH 332/648] Apply suggestions from code review --- docs/source/changelog.rst | 2 +- docs/source/changes/76.register_drones.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 44b9793..e120589 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -18,7 +18,7 @@ Version [Unreleased] - 2019-11-29 * **[Changed]** Standardisation of units -* **[Fixed]** Registering of drones +* **[Fixed]** Duplicate registration of drones * **[Fixed]** Handling of black for pypy * **[Fixed]** Proper termination of simulation * **[Fixed]** Jobs execution within drones diff --git a/docs/source/changes/76.register_drones.yaml b/docs/source/changes/76.register_drones.yaml index 69a88f0..a0f0c90 100644 --- a/docs/source/changes/76.register_drones.yaml +++ b/docs/source/changes/76.register_drones.yaml @@ -1,5 +1,5 @@ category: fixed -summary: "Registering of drones" +summary: "Duplicate registration of drones" description: | At startup drones were registered twice at the scheduler as the method ``register_drone`` was called during initialisation and in ``run``. 
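As a side note to the duplicate-registration fix above, the intended flow can be sketched as follows. This is a hypothetical illustration, not the actual ``lapis.drone.Drone`` code: the class body and the ``await (time + delay)`` waiting idiom are assumptions, and only the single call to ``register_drone()`` reflects what the change fragment describes.

.. code-block:: python3

    # Illustrative only: the drone announces itself to the scheduler exactly
    # once, from run(), after its scheduling delay has passed; __init__ no
    # longer special-cases scheduling_duration == 0.
    from usim import time


    class DroneSketch:
        def __init__(self, scheduler, scheduling_duration):
            self.scheduler = scheduler
            self.scheduling_duration = scheduling_duration
            self._supply = 0
            self.jobs = 0

        async def run(self):
            # wait out the scheduling delay on the simulation clock ...
            await (time + self.scheduling_duration)
            # ... and only then register with the scheduler, exactly once
            self._supply = 1
            self.scheduler.register_drone(self)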
From f086f5518f3dd2dc0caaf9779dce001d45fb3e7a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 30 Nov 2019 19:43:26 +0100 Subject: [PATCH 333/648] ensured that jobs could be properly removed from job queue when being scheduled --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index dc564b9..b38d53e 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -88,7 +88,7 @@ async def run(self): async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): - for job in self.job_queue: + for job in self.job_queue.copy(): best_match = self._schedule_job(job) if best_match: await best_match.schedule_job(job) From f09cfd918f053f17dda90904a938da0544f92623 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Sat, 30 Nov 2019 19:49:42 +0100 Subject: [PATCH 334/648] Added documentation of change --- docs/source/changelog.rst | 5 +++-- docs/source/changes/79.jobqueue_removal.yaml | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 docs/source/changes/79.jobqueue_removal.yaml diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index e120589..b201bb3 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by log.py at 2019-11-29, command +.. Created by log.py at 2019-11-30, command '/Users/eileenwork/development/work/lapis/venv/lib/python3.7/site-packages/change/__main__.py log docs/source/changes compile --output docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' ######### @@ -8,7 +8,7 @@ ChangeLog Upcoming ======== -Version [Unreleased] - 2019-11-29 +Version [Unreleased] - 2019-11-30 +++++++++++++++++++++++++++++++++ * **[Added]** Basic documentation @@ -22,6 +22,7 @@ Version [Unreleased] - 2019-11-29 * **[Fixed]** Handling of black for pypy * **[Fixed]** Proper termination of simulation * **[Fixed]** Jobs execution within drones +* **[Fixed]** Scheduling of jobs * **[Fixed]** Importing of HTCondor jobs 0.3 Series diff --git a/docs/source/changes/79.jobqueue_removal.yaml b/docs/source/changes/79.jobqueue_removal.yaml new file mode 100644 index 0000000..5dd0cf6 --- /dev/null +++ b/docs/source/changes/79.jobqueue_removal.yaml @@ -0,0 +1,8 @@ +category: fixed +summary: "Scheduling of jobs" +description: | + During the scheduling cycle the original job queue was used although jobs + could be removed during scheduling. Now scheduling is performed on a copy + of the job queue. 
+pull requests: + - 79 From 3f29034f74893be40729c725ff524262030036c3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 12 Dec 2019 16:32:50 +0100 Subject: [PATCH 335/648] added await instant to ensure that jobs get correct state when being cancelled --- lapis/drone.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index 1932537..48142c6 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -137,7 +137,9 @@ async def _run_job(self, job: Job, kill: bool): job.resources[resource_key] < job.used_resources[resource_key] ): + await instant job_execution.cancel() + await instant except KeyError: # check is not relevant if the data is not stored pass @@ -146,9 +148,11 @@ async def _run_job(self, job: Job, kill: bool): except ResourcesUnavailable: await instant job_execution.cancel() + await instant except AssertionError: await instant job_execution.cancel() + await instant self.jobs -= 1 await self.scheduler.job_finished(job) self._utilisation = self._allocation = None From 92989aa91d1a5e03cd3be46e94edac8f96077a13 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 12 Dec 2019 16:39:12 +0100 Subject: [PATCH 336/648] added change fragment --- docs/source/changelog.rst | 6 ++++-- docs/source/changes/80.job_cancelation.yaml | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 docs/source/changes/80.job_cancelation.yaml diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index b201bb3..8ea72ab 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by log.py at 2019-11-30, command +.. Created by log.py at 2019-12-12, command '/Users/eileenwork/development/work/lapis/venv/lib/python3.7/site-packages/change/__main__.py log docs/source/changes compile --output docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' ######### @@ -8,7 +8,7 @@ ChangeLog Upcoming ======== -Version [Unreleased] - 2019-11-30 +Version [Unreleased] - 2019-12-12 +++++++++++++++++++++++++++++++++ * **[Added]** Basic documentation @@ -23,6 +23,7 @@ Version [Unreleased] - 2019-11-30 * **[Fixed]** Proper termination of simulation * **[Fixed]** Jobs execution within drones * **[Fixed]** Scheduling of jobs +* **[Fixed]** Cancelation of jobs * **[Fixed]** Importing of HTCondor jobs 0.3 Series @@ -64,3 +65,4 @@ Version [0.1.1] - 2019-10-24 * **[Fixed]** Calculation of used and requested resource ratio * **[Fixed]** StopIteration handling by Job Generator * **[Fixed]** Importing of SWF files + diff --git a/docs/source/changes/80.job_cancelation.yaml b/docs/source/changes/80.job_cancelation.yaml new file mode 100644 index 0000000..7738cc4 --- /dev/null +++ b/docs/source/changes/80.job_cancelation.yaml @@ -0,0 +1,8 @@ +category: fixed +summary: "Cancelation of jobs" +description: | + When a drone tried to cancel a job it could happen that the success state + of that job was not properly set as the job was not yet in running state. + This is fixed now by additionally waiting for an `instant`. 
+pull requests: + - 80 From ddc4e3cf9c47755fa8ae2d356da25212fcc65cb4 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 16:54:59 +0100 Subject: [PATCH 337/648] added storage object to represent caches and associated readout functionality --- lapis/storage.py | 81 +++++++++++++++++++++++++ lapis/storage_io/__init__.py | 0 lapis/storage_io/storage_information.py | 34 +++++++++++ 3 files changed, 115 insertions(+) create mode 100644 lapis/storage.py create mode 100644 lapis/storage_io/__init__.py create mode 100644 lapis/storage_io/storage_information.py diff --git a/lapis/storage.py b/lapis/storage.py new file mode 100644 index 0000000..fd69e23 --- /dev/null +++ b/lapis/storage.py @@ -0,0 +1,81 @@ +from usim import time + +from typing import Optional + +from lapis.utilities.cache_algorithm_implementations import cache_algorithm +from lapis.utilities.cache_cleanup_implementations import cache_cleanup + + +class Storage(object): + + __slots__ = ("name", "sitename", "storagesize", "usedstorage", "content") + + def __init__( + self, name: str, sitename: str, storagesize: int, content: Optional[dict] = None + ): + self.name = name + self.sitename = sitename + self.storagesize = storagesize + self.content = content + self.usedstorage = self.get_used_storage() + self.describe_state() + + def get_used_storage(self): + return sum(subdict["usedsize"] for subdict in self.content.values()) + + def free_space(self): + return self.storagesize - self.usedstorage + + def place_new_file(self, filerequest: tuple): + filename, filespecs = filerequest + if self.free_space() - filespecs["usedsize"] < 0: + self.make_room(self.free_space() - filespecs["usedsize"]) + self.content.update({filename: filespecs}) + self.content[filename].update( + cachedsince=time.now, lastaccessed=time.now, numberofaccesses=0 + ) + self.usedstorage = self.get_used_storage() + + def update_file(self, filerequest: tuple): + filename, filespecs = filerequest + requested_file = filename + filesize_difference = ( + filespecs["usedsize"] - self.content[requested_file]["usedsize"] + ) + if filesize_difference > 0: + self.make_room(filesize_difference) + self.content[requested_file]["usedsize"] += filesize_difference + self.content[requested_file]["lastaccessed"] = time.now + self.content[requested_file]["numberofaccesses"] += 1 + self.usedstorage = self.get_used_storage() + + def make_room(self, filesize_difference: int): + if self.free_space() - filesize_difference < 0: + cache_cleanup["fifo"](filesize_difference, self) + + def provides_file(self, filerequest: dict): + filename, filespecs = filerequest + if filename in self.content.keys(): + self.update_file(filerequest) + return True + else: + if self.cache_file(): + self.place_new_file(filerequest) + return False + + def cache_file(self): + # cache everything, test different implementations + return cache_algorithm["standard"]() + + def describe_state(self): + print( + "{name} on site {site}: {used}MB of {tot}MB used ({div} %), contains " + "files {filelist}".format( + name=self.name, + site=self.sitename, + used=self.usedstorage, + tot=self.storagesize, + div=100.0 * self.usedstorage / self.storagesize, + filelist=", ".join(self.content.keys()), + ) + ) diff --git a/lapis/storage_io/__init__.py b/lapis/storage_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lapis/storage_io/storage_information.py b/lapis/storage_io/storage_information.py new file mode 100644 index 0000000..5829816 --- /dev/null +++ b/lapis/storage_io/storage_information.py @@ -0,0 
+1,34 @@ +import csv +from lapis.storage import Storage + + +def storage_reader(storage, storage_content): + storage_content = storage_content_reader(storage_content) + reader = csv.DictReader(storage, delimiter=" ", quotechar="'") + for row in reader: + yield Storage( + name=row["name"], + sitename=row["sitename"], + storagesize=int(row["cachesizeMB"]), + content=storage_content[row["name"]], + ) + + +def storage_content_reader(file_name): + reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") + cache_information = dict() + for row in reader: + if row["cachename"] not in cache_information.keys(): + cache_information[row["cachename"]] = dict() + cache_information[row["cachename"]][row["filename"]] = dict() + for key in [ + "filesize", + "usedsize", + "cachedsince", + "lastaccessed", + "numberofaccesses", + ]: + cache_information[row["cachename"]][row["filename"]][key] = int(row[key]) + if not cache_information: + cache_information = None + return cache_information From fd15e26f391d152b3e59e4452bd9c17334f2e47f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 16:56:38 +0100 Subject: [PATCH 338/648] extended CLI to support storage files --- lapis/cli/simulate.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index aa176ee..a4c98c3 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -6,9 +6,11 @@ from lapis.controller import SimulatedLinearController from lapis.job_io.htcondor import htcondor_job_reader + from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader +from lapis.storage_io.storage_information import storage_reader from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator @@ -25,6 +27,8 @@ pool_import_mapper = {"htcondor": htcondor_pool_reader} +storage_import_mapper = {"standard": storage_reader} + @click.group() @click.option("--seed", type=int, default=1234) @@ -71,8 +75,17 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))), multiple=True, ) +@click.option( + "--storage-files", + "storage_files", + type=( + click.File("r"), + click.File("r"), + click.Choice(list(storage_import_mapper.keys())), + ), +) @click.pass_context -def static(ctx, job_file, pool_file): +def static(ctx, job_file, pool_file, storage_files): click.echo("starting static environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file @@ -87,6 +100,12 @@ def static(ctx, job_file, pool_file): pool_reader=pool_import_mapper[pool_file_type], pool_type=StaticPool, ) + storage_file, storage_content_file, storage_type = storage_files + simulator.create_storage( + storage_input=storage_file, + storage_content_input=storage_content_file, + storage_reader=storage_import_mapper[storage_type], + ) simulator.run(until=ctx.obj["until"]) From 99a926e9c554f865b9ad7bccae4bd5ba81005b7a Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 17:31:03 +0100 Subject: [PATCH 339/648] extended simulator to support storage files --- lapis/simulator.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 2920202..53fb0d7 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,6 +6,7 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler +from lapis.file_provider import FileProvider from 
lapis.monitor.general import ( user_demand, job_statistics, @@ -26,6 +27,8 @@ def __init__(self, seed=1234): random.seed(seed) self.job_queue = Queue() self.pools = [] + self.storage_list = [] + self.fileprovider = FileProvider() self.controllers = [] self.job_scheduler = None self.job_generator = None @@ -60,8 +63,17 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): if controller: self.controllers.append(controller(target=pool, rate=1)) + def create_storage(self, storage_input, storage_content_input, storage_reader): + for storage in storage_reader( + storage=storage_input, storage_content=storage_content_input + ): + self.storage_list.append(storage) + self.fileprovider.add_storage_element(storage) + def create_scheduler(self, scheduler_type): - self.job_scheduler = scheduler_type(job_queue=self.job_queue) + self.job_scheduler = scheduler_type( + job_queue=self.job_queue, fileprovider=self.fileprovider + ) def run(self, until=None): print(f"running until {until}") From d3c1581fb352e718617bf05abcc1da4a6e0f9c05 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 17:32:49 +0100 Subject: [PATCH 340/648] added new drone attribute sitename connecting drones and storage elements --- lapis/drone.py | 7 +++++++ lapis/pool_io/htcondor.py | 1 + 2 files changed, 8 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index 48142c6..9f7fcc5 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -15,6 +15,7 @@ def __init__( pool_resources: dict, scheduling_duration: float, ignore_resources: list = None, + sitename: str = None, ): """ :param scheduler: @@ -23,6 +24,7 @@ def __init__( """ super(Drone, self).__init__() self.scheduler = scheduler + self.sitename = sitename self.pool_resources = pool_resources self.resources = Capacities(**pool_resources) # shadowing requested resources to determine jobs to be killed @@ -145,6 +147,11 @@ async def _run_job(self, job: Job, kill: bool): pass self.scheduler.update_drone(self) await job_execution.done + print( + "finished job {} on drone {} @ {}".format( + repr(job), repr(self), time.now + ) + ) except ResourcesUnavailable: await instant job_execution.cancel() diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 0dba5c1..84fd948 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -48,5 +48,6 @@ def htcondor_pool_reader( for key, value in resource_name_mapping.items() }, ignore_resources=["disk"], + sitename=row.get("sitename", None), ), ) From ebbe71ce8a5862fad6e688d1c9b6ccc88c5816da Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 17:35:45 +0100 Subject: [PATCH 341/648] added file provider object connecting storage objects and jobs --- lapis/file_provider.py | 37 +++++++++++++++++++++++++++++++++++++ lapis/job.py | 21 +++++++++++++++++---- lapis/scheduler.py | 13 ++++++++++--- 3 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 lapis/file_provider.py diff --git a/lapis/file_provider.py b/lapis/file_provider.py new file mode 100644 index 0000000..044ae6b --- /dev/null +++ b/lapis/file_provider.py @@ -0,0 +1,37 @@ +from lapis.storage import Storage + + +class FileProvider(object): + + __slots__ = ("storages",) + + def __init__(self): + self.storages = dict() + + def add_storage_element(self, storage_element: Storage): + try: + self.storages[storage_element.sitename].append(storage_element) + except KeyError: + self.storages[storage_element.sitename] = [storage_element] + + def provides_all_files(self, job): + """ + Dummy implementation, to 
be replaced: if a part of every inputfile of the job is + provided by a storage element located on the same site as the drone the job + is running on this function returns True + :param job: + :return: + """ + provided_storages = self.storages.get(job.drone.sitename, None) + if provided_storages: + for inputfilename, inputfilespecs in job.inputfiles.items(): + provides_inputfile = 0 + for storage in provided_storages: + provides_inputfile += storage.provides_file( + (inputfilename, inputfilespecs) + ) + if not provides_inputfile: + return False + return True + else: + return False diff --git a/lapis/job.py b/lapis/job.py index c4627e0..e65f870 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -5,6 +5,7 @@ from usim import CancelTask from lapis.monitor import sampling_required +from lapis.utilities.walltime_models import walltime_models if TYPE_CHECKING: from lapis.drone import Drone @@ -23,6 +24,7 @@ class Job(object): "in_queue_until", "_name", "drone", + "fileprovider", "_success", ) @@ -68,6 +70,7 @@ def __init__( self.in_queue_since = in_queue_since self.in_queue_until = None self.drone = drone + self.fileprovider = None self._name = name self._success: Optional[bool] = None @@ -91,14 +94,24 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - async def run(self, drone: "Drone"): - assert drone, "Jobs cannot run without a drone being assigned" - self.drone = drone + def modified_walltime(self): + if self.fileprovider.provides_all_files(self): + return walltime_models["maxeff"](self) + else: + return self.walltime + + async def run(self): self.in_queue_until = time.now self._success = None await sampling_required.put(self) + print( + "running job {} on site {} in drone {}".format( + repr(self), self.drone.sitename, repr(self.drone) + ) + ) + walltime = self.modified_walltime() try: - await (time + self.walltime) + await (time + walltime) except CancelTask: self.drone = None self._success = False diff --git a/lapis/scheduler.py b/lapis/scheduler.py index b38d53e..36bbb77 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,5 +1,5 @@ from typing import Dict -from usim import Scope, interval, Resources +from usim import Scope, interval, time from lapis.drone import Drone from lapis.monitor import sampling_required @@ -26,9 +26,10 @@ class CondorJobScheduler(object): :return: """ - def __init__(self, job_queue): + def __init__(self, job_queue, fileprovider): self._stream_queue = job_queue self.drone_cluster = [] + self.fileprovider = fileprovider self.interval = 60 self.job_queue = JobQueue() self._collecting = True @@ -91,7 +92,13 @@ async def run(self): for job in self.job_queue.copy(): best_match = self._schedule_job(job) if best_match: - await best_match.schedule_job(job) + job.fileprovider = self.fileprovider + print( + "start job {} on drone {} @ {}".format( + repr(job), repr(best_match), time.now + ) + ) + scope.do(best_match.start_job(job)) self.job_queue.remove(job) await sampling_required.put(self.job_queue) self.unregister_drone(best_match) From d1b1ddeb799bcecdebfe692c23a4e78eb28fb1a4 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 17:37:04 +0100 Subject: [PATCH 342/648] added different caching/cache cleaning/walltime recalculation algorithms --- .../cache_algorithm_implementations.py | 5 ++++ .../cache_cleanup_implementations.py | 25 +++++++++++++++++++ lapis/utilities/walltime_models.py | 6 +++++ 3 files changed, 36 insertions(+) create mode 100644 
lapis/utilities/cache_algorithm_implementations.py create mode 100644 lapis/utilities/cache_cleanup_implementations.py create mode 100644 lapis/utilities/walltime_models.py diff --git a/lapis/utilities/cache_algorithm_implementations.py b/lapis/utilities/cache_algorithm_implementations.py new file mode 100644 index 0000000..23b1ff0 --- /dev/null +++ b/lapis/utilities/cache_algorithm_implementations.py @@ -0,0 +1,5 @@ +def cache_all(): + return True + + +cache_algorithm = {"standard": cache_all} diff --git a/lapis/utilities/cache_cleanup_implementations.py b/lapis/utilities/cache_cleanup_implementations.py new file mode 100644 index 0000000..90c4e6e --- /dev/null +++ b/lapis/utilities/cache_cleanup_implementations.py @@ -0,0 +1,25 @@ +def fifo(size, storage): + # FIFO, test different implementations + sorted_content = sorted( + list(storage.content.items()), key=lambda x: x[1]["cachedsince"] + ) + while size < 0: + size += sorted_content[0][1]["cachedsizeMB"] + storage.content.pop(sorted_content[0][0]) + storage.usedstorage -= sorted_content[0][1]["cachedsizeMB"] + sorted_content.pop(0) + + +def last_accessed(size, storage): + # FIFO, test different implementations + sorted_content = sorted( + list(storage.content.items()), key=lambda x: x[1]["lastaccessed"] + ) + while size < 0: + size += sorted_content[0][1]["cachedsizeMB"] + storage.content.pop(sorted_content[0][0]) + storage.usedstorage -= sorted_content[0][1]["cachedsizeMB"] + sorted_content.pop(0) + + +cache_cleanup = {"fifo": fifo, "lastaccessed": last_accessed} diff --git a/lapis/utilities/walltime_models.py b/lapis/utilities/walltime_models.py new file mode 100644 index 0000000..d214c25 --- /dev/null +++ b/lapis/utilities/walltime_models.py @@ -0,0 +1,6 @@ +def extrapolate_walltime_to_maximal_efficiency(job, maximal_efficiency: float = 0.8): + return (job.used_resources["cores"] / maximal_efficiency) * job.walltime + + +# TODO: add models depending on fraction of cached files etc +walltime_models = {"maxeff": extrapolate_walltime_to_maximal_efficiency} From e1896ebd8522a5fab9a67e68457897ea6d6ec3b7 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 31 Oct 2019 20:00:58 +0100 Subject: [PATCH 343/648] renamed storage readout --- lapis/cli/simulate.py | 2 +- lapis/storage_io/storage.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 lapis/storage_io/storage.py diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index a4c98c3..8c41848 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -10,7 +10,7 @@ from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader -from lapis.storage_io.storage_information import storage_reader +from lapis.storage_io.storage import storage_reader from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py new file mode 100644 index 0000000..5829816 --- /dev/null +++ b/lapis/storage_io/storage.py @@ -0,0 +1,34 @@ +import csv +from lapis.storage import Storage + + +def storage_reader(storage, storage_content): + storage_content = storage_content_reader(storage_content) + reader = csv.DictReader(storage, delimiter=" ", quotechar="'") + for row in reader: + yield Storage( + name=row["name"], + sitename=row["sitename"], + storagesize=int(row["cachesizeMB"]), + content=storage_content[row["name"]], + ) + + +def storage_content_reader(file_name): + reader = 
csv.DictReader(file_name, delimiter=" ", quotechar="'") + cache_information = dict() + for row in reader: + if row["cachename"] not in cache_information.keys(): + cache_information[row["cachename"]] = dict() + cache_information[row["cachename"]][row["filename"]] = dict() + for key in [ + "filesize", + "usedsize", + "cachedsince", + "lastaccessed", + "numberofaccesses", + ]: + cache_information[row["cachename"]][row["filename"]][key] = int(row[key]) + if not cache_information: + cache_information = None + return cache_information From 1f8ef36bc40de1d4fdb0e2fa26523381392908ec Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 1 Nov 2019 12:32:46 +0100 Subject: [PATCH 344/648] fixed debug output --- lapis/job.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index e65f870..d7f6a87 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -104,11 +104,12 @@ async def run(self): self.in_queue_until = time.now self._success = None await sampling_required.put(self) - print( - "running job {} on site {} in drone {}".format( - repr(self), self.drone.sitename, repr(self.drone) + if self.drone: + print( + "running job {} on site {} in drone {}".format( + repr(self), self.drone.sitename, repr(self.drone) + ) ) - ) walltime = self.modified_walltime() try: await (time + walltime) From 7d1b635ecabdd02058ad531b6486f174fd8b5480 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 1 Nov 2019 12:34:18 +0100 Subject: [PATCH 345/648] renamed storage input reader --- lapis/storage_io/storage_information.py | 34 ------------------------- 1 file changed, 34 deletions(-) delete mode 100644 lapis/storage_io/storage_information.py diff --git a/lapis/storage_io/storage_information.py b/lapis/storage_io/storage_information.py deleted file mode 100644 index 5829816..0000000 --- a/lapis/storage_io/storage_information.py +++ /dev/null @@ -1,34 +0,0 @@ -import csv -from lapis.storage import Storage - - -def storage_reader(storage, storage_content): - storage_content = storage_content_reader(storage_content) - reader = csv.DictReader(storage, delimiter=" ", quotechar="'") - for row in reader: - yield Storage( - name=row["name"], - sitename=row["sitename"], - storagesize=int(row["cachesizeMB"]), - content=storage_content[row["name"]], - ) - - -def storage_content_reader(file_name): - reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") - cache_information = dict() - for row in reader: - if row["cachename"] not in cache_information.keys(): - cache_information[row["cachename"]] = dict() - cache_information[row["cachename"]][row["filename"]] = dict() - for key in [ - "filesize", - "usedsize", - "cachedsince", - "lastaccessed", - "numberofaccesses", - ]: - cache_information[row["cachename"]][row["filename"]][key] = int(row[key]) - if not cache_information: - cache_information = None - return cache_information From c76e984c79a21dc6a6e3a61b000fd83b4b8e1cfa Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 12:49:55 +0100 Subject: [PATCH 346/648] updated Job class --- lapis/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index d7f6a87..b4ed798 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -95,7 +95,7 @@ def waiting_time(self) -> float: return float("Inf") def modified_walltime(self): - if self.fileprovider.provides_all_files(self): + if self.fileprovider and self.fileprovider.provides_all_files(self): return walltime_models["maxeff"](self) else: return self.walltime From 
6275aae9267ef52caec3eaec1fccba31cd1f8642 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 16:41:21 +0100 Subject: [PATCH 347/648] replaced function modifying walltime by function with property decorator --- lapis/job.py | 14 +++++++------- lapis/utilities/walltime_models.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index b4ed798..75f5374 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -15,7 +15,7 @@ class Job(object): __slots__ = ( "resources", "used_resources", - "walltime", + "_walltime", "requested_walltime", "queue_date", "requested_inputfiles", @@ -61,7 +61,7 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] - self.walltime = used_resources.pop("walltime") + self._walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) self.requested_inputfiles = resources.pop("inputfiles", None) self.used_inputfiles = used_resources.pop("inputfiles", None) @@ -94,11 +94,12 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - def modified_walltime(self): + @property + def walltime(self): if self.fileprovider and self.fileprovider.provides_all_files(self): - return walltime_models["maxeff"](self) + return walltime_models["maxeff"](self, self._walltime) else: - return self.walltime + return self._walltime async def run(self): self.in_queue_until = time.now @@ -110,9 +111,8 @@ async def run(self): repr(self), self.drone.sitename, repr(self.drone) ) ) - walltime = self.modified_walltime() try: - await (time + walltime) + await (time + self.walltime) except CancelTask: self.drone = None self._success = False diff --git a/lapis/utilities/walltime_models.py b/lapis/utilities/walltime_models.py index d214c25..bd2083a 100644 --- a/lapis/utilities/walltime_models.py +++ b/lapis/utilities/walltime_models.py @@ -1,5 +1,11 @@ -def extrapolate_walltime_to_maximal_efficiency(job, maximal_efficiency: float = 0.8): - return (job.used_resources["cores"] / maximal_efficiency) * job.walltime +from lapis.job import Job + + +def extrapolate_walltime_to_maximal_efficiency( + job: Job, original_walltime, maximal_efficiency: float = 0.8 +): + + return (job.used_resources["cores"] / maximal_efficiency) * original_walltime # TODO: add models depending on fraction of cached files etc From 2f1e22323cca01fb1f63e93e2b6a9fc902a426e3 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 16:45:25 +0100 Subject: [PATCH 348/648] Revert "replaced function modifying walltime by function with property decorator" This reverts commit f9972234ddbcd5d073f51a085b70752146e356d1. 
--- lapis/job.py | 14 +++++++------- lapis/utilities/walltime_models.py | 10 ++-------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 75f5374..b4ed798 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -15,7 +15,7 @@ class Job(object): __slots__ = ( "resources", "used_resources", - "_walltime", + "walltime", "requested_walltime", "queue_date", "requested_inputfiles", @@ -61,7 +61,7 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] - self._walltime = used_resources.pop("walltime") + self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) self.requested_inputfiles = resources.pop("inputfiles", None) self.used_inputfiles = used_resources.pop("inputfiles", None) @@ -94,12 +94,11 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - @property - def walltime(self): + def modified_walltime(self): if self.fileprovider and self.fileprovider.provides_all_files(self): - return walltime_models["maxeff"](self, self._walltime) + return walltime_models["maxeff"](self) else: - return self._walltime + return self.walltime async def run(self): self.in_queue_until = time.now @@ -111,8 +110,9 @@ async def run(self): repr(self), self.drone.sitename, repr(self.drone) ) ) + walltime = self.modified_walltime() try: - await (time + self.walltime) + await (time + walltime) except CancelTask: self.drone = None self._success = False diff --git a/lapis/utilities/walltime_models.py b/lapis/utilities/walltime_models.py index bd2083a..d214c25 100644 --- a/lapis/utilities/walltime_models.py +++ b/lapis/utilities/walltime_models.py @@ -1,11 +1,5 @@ -from lapis.job import Job - - -def extrapolate_walltime_to_maximal_efficiency( - job: Job, original_walltime, maximal_efficiency: float = 0.8 -): - - return (job.used_resources["cores"] / maximal_efficiency) * original_walltime +def extrapolate_walltime_to_maximal_efficiency(job, maximal_efficiency: float = 0.8): + return (job.used_resources["cores"] / maximal_efficiency) * job.walltime # TODO: add models depending on fraction of cached files etc From ed5d2321f4ff2dd716d530c0a5276bacd6a3c069 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 17:07:31 +0100 Subject: [PATCH 349/648] replaced function modifying walltime by function with property decorator --- lapis/job.py | 14 +++++++------- lapis/utilities/walltime_models.py | 6 ++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index b4ed798..75f5374 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -15,7 +15,7 @@ class Job(object): __slots__ = ( "resources", "used_resources", - "walltime", + "_walltime", "requested_walltime", "queue_date", "requested_inputfiles", @@ -61,7 +61,7 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] - self.walltime = used_resources.pop("walltime") + self._walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) self.requested_inputfiles = resources.pop("inputfiles", None) self.used_inputfiles = used_resources.pop("inputfiles", None) @@ -94,11 +94,12 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - def modified_walltime(self): + @property + def walltime(self): if self.fileprovider and self.fileprovider.provides_all_files(self): - return walltime_models["maxeff"](self) + return walltime_models["maxeff"](self, 
self._walltime) else: - return self.walltime + return self._walltime async def run(self): self.in_queue_until = time.now @@ -110,9 +111,8 @@ async def run(self): repr(self), self.drone.sitename, repr(self.drone) ) ) - walltime = self.modified_walltime() try: - await (time + walltime) + await (time + self.walltime) except CancelTask: self.drone = None self._success = False diff --git a/lapis/utilities/walltime_models.py b/lapis/utilities/walltime_models.py index d214c25..b6e5dae 100644 --- a/lapis/utilities/walltime_models.py +++ b/lapis/utilities/walltime_models.py @@ -1,5 +1,7 @@ -def extrapolate_walltime_to_maximal_efficiency(job, maximal_efficiency: float = 0.8): - return (job.used_resources["cores"] / maximal_efficiency) * job.walltime +def extrapolate_walltime_to_maximal_efficiency( + job, original_walltime, maximal_efficiency: float = 0.8 +): + return (job.used_resources["cores"] / maximal_efficiency) * original_walltime # TODO: add models depending on fraction of cached files etc From 2eace5e79dd2eba6cdda70376e26947f9ae3f88d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 17:09:56 +0100 Subject: [PATCH 350/648] resolving PEP8 issues --- lapis/utilities/walltime_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapis/utilities/walltime_models.py b/lapis/utilities/walltime_models.py index b6e5dae..7484b6a 100644 --- a/lapis/utilities/walltime_models.py +++ b/lapis/utilities/walltime_models.py @@ -1,3 +1,5 @@ + + def extrapolate_walltime_to_maximal_efficiency( job, original_walltime, maximal_efficiency: float = 0.8 ): From 0fc6a4ce8833260a302b7a8814b861d8d0ef23f0 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 2 Nov 2019 20:56:46 +0100 Subject: [PATCH 351/648] fixed file provider bug (wrong inputfiles dictionary) --- lapis/file_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/file_provider.py b/lapis/file_provider.py index 044ae6b..49ff321 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -24,7 +24,7 @@ def provides_all_files(self, job): """ provided_storages = self.storages.get(job.drone.sitename, None) if provided_storages: - for inputfilename, inputfilespecs in job.inputfiles.items(): + for inputfilename, inputfilespecs in job.used_inputfiles.items(): provides_inputfile = 0 for storage in provided_storages: provides_inputfile += storage.provides_file( From 840868f736af83e0c3ceec2c9a89721c8c1cd542 Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Mon, 4 Nov 2019 09:23:18 +0100 Subject: [PATCH 352/648] Update lapis/cli/simulate.py Co-Authored-By: Eileen Kuehn --- lapis/cli/simulate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 8c41848..85d6854 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -6,7 +6,6 @@ from lapis.controller import SimulatedLinearController from lapis.job_io.htcondor import htcondor_job_reader - from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader From 4f18248d90391a665ca372bd0afd83c17efd7970 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 4 Nov 2019 13:39:16 +0100 Subject: [PATCH 353/648] renamed function get_used_storage to _calculate_used_storage --- lapis/storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index fd69e23..9f11442 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -17,10 
+17,10 @@ def __init__( self.sitename = sitename self.storagesize = storagesize self.content = content - self.usedstorage = self.get_used_storage() + self.usedstorage = self._calculate_used_storage() self.describe_state() - def get_used_storage(self): + def _calculate_used_storage(self): return sum(subdict["usedsize"] for subdict in self.content.values()) def free_space(self): @@ -34,7 +34,7 @@ def place_new_file(self, filerequest: tuple): self.content[filename].update( cachedsince=time.now, lastaccessed=time.now, numberofaccesses=0 ) - self.usedstorage = self.get_used_storage() + self.usedstorage = self._calculate_used_storage() def update_file(self, filerequest: tuple): filename, filespecs = filerequest @@ -47,7 +47,7 @@ def update_file(self, filerequest: tuple): self.content[requested_file]["usedsize"] += filesize_difference self.content[requested_file]["lastaccessed"] = time.now self.content[requested_file]["numberofaccesses"] += 1 - self.usedstorage = self.get_used_storage() + self.usedstorage = self._calculate_used_storage() def make_room(self, filesize_difference: int): if self.free_space() - filesize_difference < 0: From 1658b5b8fd6ad07f856bf26e01f8a4f19fd678a7 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 4 Nov 2019 16:52:38 +0100 Subject: [PATCH 354/648] attached fileprovider to drone instead of job and passed it via make_drone instead of scheduler --- lapis/drone.py | 2 ++ lapis/job.py | 6 +++--- lapis/scheduler.py | 4 +--- lapis/simulator.py | 8 ++------ 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 9f7fcc5..5e7541f 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -12,6 +12,7 @@ class Drone(interfaces.Pool): def __init__( self, scheduler, + fileprovider, pool_resources: dict, scheduling_duration: float, ignore_resources: list = None, @@ -24,6 +25,7 @@ def __init__( """ super(Drone, self).__init__() self.scheduler = scheduler + self.fileprovider = fileprovider self.sitename = sitename self.pool_resources = pool_resources self.resources = Capacities(**pool_resources) diff --git a/lapis/job.py b/lapis/job.py index 75f5374..bf87805 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -24,7 +24,6 @@ class Job(object): "in_queue_until", "_name", "drone", - "fileprovider", "_success", ) @@ -70,7 +69,6 @@ def __init__( self.in_queue_since = in_queue_since self.in_queue_until = None self.drone = drone - self.fileprovider = None self._name = name self._success: Optional[bool] = None @@ -96,7 +94,9 @@ def waiting_time(self) -> float: @property def walltime(self): - if self.fileprovider and self.fileprovider.provides_all_files(self): + if self.drone.fileprovider and self.drone.fileprovider.input_file_coverage( + self.drone.sitename, self.used_inputfiles + ): return walltime_models["maxeff"](self, self._walltime) else: return self._walltime diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 36bbb77..d20ad20 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -26,10 +26,9 @@ class CondorJobScheduler(object): :return: """ - def __init__(self, job_queue, fileprovider): + def __init__(self, job_queue): self._stream_queue = job_queue self.drone_cluster = [] - self.fileprovider = fileprovider self.interval = 60 self.job_queue = JobQueue() self._collecting = True @@ -92,7 +91,6 @@ async def run(self): for job in self.job_queue.copy(): best_match = self._schedule_job(job) if best_match: - job.fileprovider = self.fileprovider print( "start job {} on drone {} @ {}".format( repr(job), repr(best_match), time.now diff --git 
a/lapis/simulator.py b/lapis/simulator.py index 53fb0d7..83bc653 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -27,7 +27,6 @@ def __init__(self, seed=1234): random.seed(seed) self.job_queue = Queue() self.pools = [] - self.storage_list = [] self.fileprovider = FileProvider() self.controllers = [] self.job_scheduler = None @@ -57,7 +56,7 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): for pool in pool_reader( iterable=pool_input, pool_type=pool_type, - make_drone=partial(Drone, self.job_scheduler), + make_drone=partial(Drone, self.job_scheduler, self.fileprovider), ): self.pools.append(pool) if controller: @@ -67,13 +66,10 @@ def create_storage(self, storage_input, storage_content_input, storage_reader): for storage in storage_reader( storage=storage_input, storage_content=storage_content_input ): - self.storage_list.append(storage) self.fileprovider.add_storage_element(storage) def create_scheduler(self, scheduler_type): - self.job_scheduler = scheduler_type( - job_queue=self.job_queue, fileprovider=self.fileprovider - ) + self.job_scheduler = scheduler_type(job_queue=self.job_queue) def run(self, until=None): print(f"running until {until}") From c07deb3bdb178eb99a441db8cc01b9528a99902f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 4 Nov 2019 16:56:49 +0100 Subject: [PATCH 355/648] reworked file coverage function to return a score --- lapis/file_provider.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lapis/file_provider.py b/lapis/file_provider.py index 49ff321..ccbd417 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -1,4 +1,5 @@ from lapis.storage import Storage +from typing import Optional class FileProvider(object): @@ -14,24 +15,25 @@ def add_storage_element(self, storage_element: Storage): except KeyError: self.storages[storage_element.sitename] = [storage_element] - def provides_all_files(self, job): + def input_file_coverage( + self, dronesite: str, requested_files: Optional[dict] = None + ) -> float: """ - Dummy implementation, to be replaced: if a part of every inputfile of the job is - provided by a storage element located on the same site as the drone the job - is running on this function returns True - :param job: + Dummy implementation, to be replaced + + :param requested_files: + :param dronesite: :return: """ - provided_storages = self.storages.get(job.drone.sitename, None) + provided_storages = self.storages.get(dronesite, None) if provided_storages: - for inputfilename, inputfilespecs in job.used_inputfiles.items(): - provides_inputfile = 0 + provides_inputfile = [] + for inputfilename, inputfilespecs in requested_files.items(): + provides_inputfile.append(0) for storage in provided_storages: - provides_inputfile += storage.provides_file( + provides_inputfile[-1] += storage.provides_file( (inputfilename, inputfilespecs) ) - if not provides_inputfile: - return False - return True + return 1 - provided_storages.count(0) / len(provided_storages) else: - return False + return 0 From adad50f4054b5d7d57b4e2d805ab5f5c3d374718 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 4 Nov 2019 16:58:02 +0100 Subject: [PATCH 356/648] added proper __repr__ function --- lapis/storage.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 9f11442..4c67c48 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -18,7 +18,8 @@ def __init__( self.storagesize = storagesize self.content = 
content self.usedstorage = self._calculate_used_storage() - self.describe_state() + self.__repr__() + print(self.sitename) def _calculate_used_storage(self): return sum(subdict["usedsize"] for subdict in self.content.values()) @@ -26,19 +27,18 @@ def _calculate_used_storage(self): def free_space(self): return self.storagesize - self.usedstorage - def place_new_file(self, filerequest: tuple): - filename, filespecs = filerequest + def add_file(self, filename: str, filespecs: tuple): + assert filename not in self.content.keys() if self.free_space() - filespecs["usedsize"] < 0: self.make_room(self.free_space() - filespecs["usedsize"]) - self.content.update({filename: filespecs}) + self.content[filename] = filespecs self.content[filename].update( cachedsince=time.now, lastaccessed=time.now, numberofaccesses=0 ) self.usedstorage = self._calculate_used_storage() def update_file(self, filerequest: tuple): - filename, filespecs = filerequest - requested_file = filename + requested_file, filespecs = filerequest filesize_difference = ( filespecs["usedsize"] - self.content[requested_file]["usedsize"] ) @@ -60,15 +60,15 @@ def provides_file(self, filerequest: dict): return True else: if self.cache_file(): - self.place_new_file(filerequest) + self.add_file(filename, filespecs) return False def cache_file(self): # cache everything, test different implementations return cache_algorithm["standard"]() - def describe_state(self): - print( + def __repr__(self): + return ( "{name} on site {site}: {used}MB of {tot}MB used ({div} %), contains " "files {filelist}".format( name=self.name, From 332923c281c1226b7b79ff18ccc1f2ec83194f24 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 7 Nov 2019 14:33:40 +0100 Subject: [PATCH 357/648] added file classes --- lapis/files.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 lapis/files.py diff --git a/lapis/files.py b/lapis/files.py new file mode 100644 index 0000000..0bb8cd7 --- /dev/null +++ b/lapis/files.py @@ -0,0 +1,60 @@ +from typing import Optional + + +class StoredFile(object): + def __init__(self, filename, filespecs): + self.filename = filename + self._filesize: Optional[int] = filespecs.get("filesize", None) + self._storedsize: Optional[int] = filespecs.get("storedsize", self._filesize) + self._cachedsince: Optional[int] = filespecs.get("cachedsince", None) + self._lastaccessed: Optional[int] = filespecs.get("lastaccessed", None) + self._numberofaccesses: int = filespecs.get("numberofaccesses", 0) + + @property + def filesize(self): + return self._filesize + + @property + def cachedsince(self): + return self._cachedsince + + @property + def lastaccessed(self): + return self._lastaccessed + + @property + def numberofaccesses(self): + return self._numberofaccesses + + @cachedsince.setter + def cachedsince(self, value: int): + self._cachedsince = value + + @lastaccessed.setter + def lastaccessed(self, value: int): + self._lastaccessed = value + + def increment_accesses(self): + self._numberofaccesses += 1 + + +class RequestedFile(object): + def __init__(self, filename, filespecs): + self.filename: str = filename + self._filesize: Optional[int] = filespecs.get("filesize", None) + # self._requestedsize: Optional[int] = filespecs.get("requestedsize", None) + # self._added: Optional[int] = filespecs.get("requestedsize", None) + + @property + def filesize(self): + return self._filesize + + def convert_to_stored_file(self, currenttime): + filespecs = dict( + filesize=self._filesize, + 
cachedsince=currenttime,
+            lastaccessed=currenttime,
+            numberofaccesses=1,
+        )
+        print("convert file: ", filespecs)
+        return StoredFile(self.filename, filespecs)

From 7b2e75087f120cc2c3e8c8b3883869e903e978bd Mon Sep 17 00:00:00 2001
From: tfesenbecker
Date: Thu, 7 Nov 2019 14:34:51 +0100
Subject: [PATCH 358/648] moved caching algorithm and associated cache cleanup to its own class

---
 lapis/cachealgorithm.py | 52 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 lapis/cachealgorithm.py

diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py
new file mode 100644
index 0000000..6064de5
--- /dev/null
+++ b/lapis/cachealgorithm.py
@@ -0,0 +1,52 @@
+from typing import Optional, Set
+from lapis.files import RequestedFile
+from lapis.utilities.cache_algorithm_implementations import cache_algorithm
+from lapis.utilities.cache_cleanup_implementations import sort_files_by_cachedsince
+
+
+class CacheAlgorithm(object):
+    def __init__(self, storage, additional_information: Optional[str] = None):
+        self._storage = storage
+        self._additional_information = additional_information
+
+    def _file_based_consideration(self, candidate: RequestedFile) -> bool:
+        """
+        File based caching decision: Checks if candidate file should be cached based on
+        conditions that apply to
+        the file itself without considering the cache's overall state.
+        :param candidate:
+        :return:
+        """
+        if self._storage.storagesize > candidate.filesize:
+            return cache_algorithm["standard"](candidate)
+        else:
+            return False
+
+    def _context_based_consideration(self, candidate: RequestedFile):
+        """
+        Caching decision based on the overall context
+        :param candidate:
+        :return:
+        """
+        to_be_removed = set()
+        sorted_stored_files = sort_files_by_cachedsince(self._storage.files)
+        current_free_storage = self._storage.free_space()
+        for stored_file in sorted_stored_files:
+            if stored_file.numberofaccesses < 3:
+                to_be_removed.add(stored_file)
+                current_free_storage += stored_file.filesize
+                if current_free_storage >= candidate.filesize:
+                    return to_be_removed
+            else:
+                continue
+        if current_free_storage >= candidate.filesize:
+            return {candidate}
+
+    def consider(self, candidate: RequestedFile) -> Optional[Set[RequestedFile]]:
+        if self._file_based_consideration(candidate):
+            if self._storage.free_space() < candidate.filesize:
+                return self._context_based_consideration(candidate)
+            else:
+                return {}
+        else:
+            return {candidate}

From bdfaee85482f052181c8607dd072b5e8a60faa36 Mon Sep 17 00:00:00 2001
From: tfesenbecker
Date: Thu, 7 Nov 2019 14:37:44 +0100
Subject: [PATCH 359/648] Redesign of the storage class and associated changes

---
 lapis/file_provider.py | 12 +-
 lapis/job.py | 15 +-
 lapis/storage.py | 134 ++++++++++++------
 lapis/storage_io/storage.py | 2 +-
 .../cache_algorithm_implementations.py | 2 +-
 .../cache_cleanup_implementations.py | 58 +++++---
 6 files changed, 145 insertions(+), 78 deletions(-)

diff --git a/lapis/file_provider.py b/lapis/file_provider.py
index ccbd417..b66e2bc 100644
--- a/lapis/file_provider.py
+++ b/lapis/file_provider.py
@@ -1,4 +1,5 @@
 from lapis.storage import Storage
+from lapis.files import RequestedFile
 from typing import Optional


@@ -15,8 +16,8 @@ def add_storage_element(self, storage_element: Storage):
         except KeyError:
             self.storages[storage_element.sitename] = [storage_element]

-    def input_file_coverage(
-        self, dronesite: str, requested_files: Optional[dict] = None
+    async def input_file_coverage(
+        self, dronesite: str, requested_files: Optional[dict] = 
None, job_repr=None ) -> float: """ Dummy implementation, to be replaced @@ -25,15 +26,18 @@ def input_file_coverage( :param dronesite: :return: """ + print("FILEPROVIDER hit input file coverage") + provided_storages = self.storages.get(dronesite, None) if provided_storages: provides_inputfile = [] for inputfilename, inputfilespecs in requested_files.items(): provides_inputfile.append(0) for storage in provided_storages: - provides_inputfile[-1] += storage.provides_file( - (inputfilename, inputfilespecs) + provides_inputfile[-1] += await storage.providing_file( + RequestedFile(inputfilename, inputfilespecs), job_repr ) + return 1 - provided_storages.count(0) / len(provided_storages) else: return 0 diff --git a/lapis/job.py b/lapis/job.py index bf87805..c33b24d 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -93,9 +93,14 @@ def waiting_time(self) -> float: return float("Inf") @property - def walltime(self): - if self.drone.fileprovider and self.drone.fileprovider.input_file_coverage( - self.drone.sitename, self.used_inputfiles + async def walltime(self): + print("DEBUG JOB hit walltime") + # TODO: reimplement that usedsize != filesize and change back to used_inputfiles + if ( + self.drone.fileprovider + and await self.drone.fileprovider.input_file_coverage( + self.drone.sitename, self.requested_inputfiles, repr(self) + ) ): return walltime_models["maxeff"](self, self._walltime) else: @@ -112,7 +117,7 @@ async def run(self): ) ) try: - await (time + self.walltime) + await (time + await self.walltime) except CancelTask: self.drone = None self._success = False @@ -126,7 +131,7 @@ async def run(self): await sampling_required.put(self) def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) + return "<%s: %s>" % (self.__class__.__name__, self._name or id(self) % 100) async def job_to_queue_scheduler(job_generator, job_queue): diff --git a/lapis/storage.py b/lapis/storage.py index 4c67c48..0147811 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -1,72 +1,118 @@ -from usim import time +from usim import time, Resources from typing import Optional -from lapis.utilities.cache_algorithm_implementations import cache_algorithm -from lapis.utilities.cache_cleanup_implementations import cache_cleanup +from lapis.files import StoredFile, RequestedFile +from lapis.cachealgorithm import CacheAlgorithm class Storage(object): - __slots__ = ("name", "sitename", "storagesize", "usedstorage", "content") + __slots__ = ( + "name", + "sitename", + "storagesize", + "placement_duration", + "deletion_duration", + "_usedstorage", + "files", + "filenames", + "cachealgorithm", + ) def __init__( - self, name: str, sitename: str, storagesize: int, content: Optional[dict] = None + self, name: str, sitename: str, storagesize: int, files: Optional[dict] = None ): self.name = name self.sitename = sitename + self.placement_duration = 10 + self.deletion_duration = 5 self.storagesize = storagesize - self.content = content - self.usedstorage = self._calculate_used_storage() + self.files = self._dict_to_file_object(files) + self.filenames = set(file.filename for file in self.files) + self._usedstorage = Resources(**dict(usedsize=self._initial_used_storage())) + self.cachealgorithm = CacheAlgorithm(self) self.__repr__() - print(self.sitename) - def _calculate_used_storage(self): - return sum(subdict["usedsize"] for subdict in self.content.values()) + def _initial_used_storage(self): + initial_value = sum(file.filesize for file in self.files) + print("{} set initial value 
{}".format(self.name, initial_value)) + return initial_value + + def _dict_to_file_object(self, files): + files_set = set() + for filename, filespecs in files.items(): + files_set.add(StoredFile(filename, filespecs)) + return files_set + + @property + def usedstorage(self): + return dict(self._usedstorage.levels)["usedsize"] def free_space(self): return self.storagesize - self.usedstorage - def add_file(self, filename: str, filespecs: tuple): - assert filename not in self.content.keys() - if self.free_space() - filespecs["usedsize"] < 0: - self.make_room(self.free_space() - filespecs["usedsize"]) - self.content[filename] = filespecs - self.content[filename].update( - cachedsince=time.now, lastaccessed=time.now, numberofaccesses=0 + def find_file(self, filename): + for file in self.files: + if file.filename == filename: + return file + + async def remove_from_storage(self, file: StoredFile, job_repr): + print( + "REMOVE FROM STORAGE: Job {}, File {} @ {}".format( + job_repr, file.filename, time.now + ) ) - self.usedstorage = self._calculate_used_storage() + await (time + self.deletion_duration) + await self._usedstorage.decrease(**{"usedsize": file.filesize}) + self.filenames.remove(file.filename) + self.files.remove(file) - def update_file(self, filerequest: tuple): - requested_file, filespecs = filerequest - filesize_difference = ( - filespecs["usedsize"] - self.content[requested_file]["usedsize"] + async def add_to_storage(self, file: RequestedFile, job_repr): + print( + "ADD TO STORAGE: Job {}, File {} @ {}".format( + job_repr, file.filename, time.now + ) ) - if filesize_difference > 0: - self.make_room(filesize_difference) - self.content[requested_file]["usedsize"] += filesize_difference - self.content[requested_file]["lastaccessed"] = time.now - self.content[requested_file]["numberofaccesses"] += 1 - self.usedstorage = self._calculate_used_storage() - - def make_room(self, filesize_difference: int): - if self.free_space() - filesize_difference < 0: - cache_cleanup["fifo"](filesize_difference, self) - - def provides_file(self, filerequest: dict): - filename, filespecs = filerequest - if filename in self.content.keys(): - self.update_file(filerequest) + file = file.convert_to_stored_file(time.now) + await (time + self.placement_duration) + await self._usedstorage.increase(**{"usedsize": file.filesize}) + self.filenames.add(file.filename) + self.files.add(file) + + def update_file(self, stored_file: StoredFile, job_repr): + print("UPDATE: Job {}, File {}".format(job_repr, stored_file.filename)) + stored_file.lastaccessed = time.now + stored_file.increment_accesses() + + async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): + print( + "APPLY CACHING DECISION: Job {}, File {} @ {}".format( + job_repr, requested_file.filename, time.now + ) + ) + to_be_removed = self.cachealgorithm.consider(requested_file) + if not to_be_removed: + await self.add_to_storage(requested_file, job_repr) + elif to_be_removed == {requested_file}: + # file will not be cached because it either does not match + # conditions or because there is no space in the cache + print( + "APPLY CACHING DECISION: Job {}, File {}: File wasnt " + "cached @ {}".format(job_repr, requested_file.filename, time.now) + ) + else: + for file in to_be_removed: + await self.remove_from_storage(file, job_repr) + + async def providing_file(self, requested_file: RequestedFile, job_repr): + if requested_file.filename in self.filenames: + self.update_file(self.find_file(requested_file.filename), job_repr) return True 
else: - if self.cache_file(): - self.add_file(filename, filespecs) + await self.apply_caching_decision(requested_file, job_repr) return False - def cache_file(self): - # cache everything, test different implementations - return cache_algorithm["standard"]() - def __repr__(self): return ( "{name} on site {site}: {used}MB of {tot}MB used ({div} %), contains " @@ -76,6 +122,6 @@ def __repr__(self): used=self.usedstorage, tot=self.storagesize, div=100.0 * self.usedstorage / self.storagesize, - filelist=", ".join(self.content.keys()), + filelist=", ".join(self.filenames), ) ) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 5829816..88a0ed6 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -10,7 +10,7 @@ def storage_reader(storage, storage_content): name=row["name"], sitename=row["sitename"], storagesize=int(row["cachesizeMB"]), - content=storage_content[row["name"]], + files=storage_content[row["name"]], ) diff --git a/lapis/utilities/cache_algorithm_implementations.py b/lapis/utilities/cache_algorithm_implementations.py index 23b1ff0..468ed0d 100644 --- a/lapis/utilities/cache_algorithm_implementations.py +++ b/lapis/utilities/cache_algorithm_implementations.py @@ -1,4 +1,4 @@ -def cache_all(): +def cache_all(*args, **kwargs): return True diff --git a/lapis/utilities/cache_cleanup_implementations.py b/lapis/utilities/cache_cleanup_implementations.py index 90c4e6e..19da640 100644 --- a/lapis/utilities/cache_cleanup_implementations.py +++ b/lapis/utilities/cache_cleanup_implementations.py @@ -1,25 +1,37 @@ -def fifo(size, storage): - # FIFO, test different implementations - sorted_content = sorted( - list(storage.content.items()), key=lambda x: x[1]["cachedsince"] - ) - while size < 0: - size += sorted_content[0][1]["cachedsizeMB"] - storage.content.pop(sorted_content[0][0]) - storage.usedstorage -= sorted_content[0][1]["cachedsizeMB"] - sorted_content.pop(0) +def sort_files_by_cachedsince(stored_files: set): + return sorted(list(stored_files), key=lambda x: x.cachedsince) -def last_accessed(size, storage): - # FIFO, test different implementations - sorted_content = sorted( - list(storage.content.items()), key=lambda x: x[1]["lastaccessed"] - ) - while size < 0: - size += sorted_content[0][1]["cachedsizeMB"] - storage.content.pop(sorted_content[0][0]) - storage.usedstorage -= sorted_content[0][1]["cachedsizeMB"] - sorted_content.pop(0) - - -cache_cleanup = {"fifo": fifo, "lastaccessed": last_accessed} +# async def fifo(size, storage): +# print("hit fifo") +# print(storage.files.keys()) +# # FIFO, test different implementations +# sorted_content = sorted( +# list(storage.files.items()), key=lambda x: x.filespecs.cachedsince +# ) +# print("sorted", sorted_content) +# while size < 0: +# print("hit while") +# size += sorted_content[0][1]["cachedsizeMB"] +# storage.files.pop(sorted_content[0][0]) +# await sleep(storage.placement_duration) +# await storage._usedstorage.decrease( +# **{"usedsize": sorted_content[0][1]["cachedsizeMB"]}) +# print(storage.usedstorage) +# sorted_content.pop(0) +# print("after fifo ", storage.files.keys()) +# +# +# def last_accessed(size, storage): +# # FIFO, test different implementations +# sorted_content = sorted( +# list(storage.content.items()), key=lambda x: x[1]["lastaccessed"] +# ) +# while size < 0: +# size += sorted_content[0][1]["cachedsizeMB"] +# storage.content.pop(sorted_content[0][0]) +# storage.usedstorage -= sorted_content[0][1]["cachedsizeMB"] +# sorted_content.pop(0) +# +# +# cache_cleanup = 
{"fifo": fifo, "lastaccessed": last_accessed} From 30418d3703a95f7ed6f346cb1d5370cba4b24813 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 7 Nov 2019 14:46:45 +0100 Subject: [PATCH 360/648] put walltime getter and walltime recalculation back in seperate methods --- lapis/job.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index c33b24d..bb3d428 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -93,18 +93,17 @@ def waiting_time(self) -> float: return float("Inf") @property - async def walltime(self): - print("DEBUG JOB hit walltime") - # TODO: reimplement that usedsize != filesize and change back to used_inputfiles - if ( - self.drone.fileprovider - and await self.drone.fileprovider.input_file_coverage( - self.drone.sitename, self.requested_inputfiles, repr(self) - ) + def walltime(self) -> float: + """ + :return: Time that passes while job is running + """ + return self._walltime + + def recalculate_walltime(self): + if self.drone.fileprovider and self.drone.fileprovider.input_file_coverage( + self.drone.sitename, self.used_inputfiles ): - return walltime_models["maxeff"](self, self._walltime) - else: - return self._walltime + self._walltime = walltime_models["maxeff"](self, self._walltime) async def run(self): self.in_queue_until = time.now @@ -117,7 +116,8 @@ async def run(self): ) ) try: - await (time + await self.walltime) + self.recalculate_walltime() + await (time + self._walltime) except CancelTask: self.drone = None self._success = False From a9966c437fff0578f7d00e7ce53c1be6d837257f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 7 Nov 2019 19:59:03 +0100 Subject: [PATCH 361/648] added parallel treatment of jobs input files in file provider --- lapis/file_provider.py | 56 ++++++++++++++++++++++++++++++++++++------ lapis/files.py | 4 +-- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/lapis/file_provider.py b/lapis/file_provider.py index b66e2bc..51133b9 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -1,6 +1,7 @@ from lapis.storage import Storage from lapis.files import RequestedFile from typing import Optional +from usim import Queue, Scope class FileProvider(object): @@ -11,6 +12,11 @@ def __init__(self): self.storages = dict() def add_storage_element(self, storage_element: Storage): + """ + Register storage element in FileProvider clustering storage elements by sitename + :param storage_element: + :return: + """ try: self.storages[storage_element.sitename].append(storage_element) except KeyError: @@ -30,14 +36,48 @@ async def input_file_coverage( provided_storages = self.storages.get(dronesite, None) if provided_storages: - provides_inputfile = [] - for inputfilename, inputfilespecs in requested_files.items(): - provides_inputfile.append(0) - for storage in provided_storages: - provides_inputfile[-1] += await storage.providing_file( - RequestedFile(inputfilename, inputfilespecs), job_repr + score_queue = Queue() + async with Scope() as scope: + for inputfilename, inputfilespecs in requested_files.items(): + scope.do( + self.look_file_up_in_storage( + RequestedFile(inputfilename, inputfilespecs), + provided_storages, + job_repr, + score_queue, + ) ) - - return 1 - provided_storages.count(0) / len(provided_storages) + await score_queue.close() + total_score = await self.calculate_score(score_queue) + return total_score / len(provided_storages) else: return 0 + + async def look_file_up_in_storage( + self, requested_file: RequestedFile, available_storages: 
list, job_repr, q + ): + """ + Calculates how many storages provide the requested file, puts result in queue + for readout. + :param requested_file: + :param available_storages: + :param job_repr: + :param q: + :return: + """ + file_score = sum( + [ + await storage.providing_file(requested_file, job_repr) + for storage in available_storages + ] + ) + await q.put({requested_file.filename: file_score}) + + async def calculate_score(self, queue: Queue): + """ + Reads each input files individual score from queue and returns number of input + files that are provided by a storage element. + :param queue: + :return: + """ + return sum([1 async for element in queue if list(element.values())[0] > 0]) diff --git a/lapis/files.py b/lapis/files.py index 0bb8cd7..65960f4 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -39,11 +39,9 @@ def increment_accesses(self): class RequestedFile(object): - def __init__(self, filename, filespecs): + def __init__(self, filename: str, filespecs: dict): self.filename: str = filename self._filesize: Optional[int] = filespecs.get("filesize", None) - # self._requestedsize: Optional[int] = filespecs.get("requestedsize", None) - # self._added: Optional[int] = filespecs.get("requestedsize", None) @property def filesize(self): From 345e7328acac43f228f775deb55740a4d24e2dcc Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 7 Nov 2019 20:08:33 +0100 Subject: [PATCH 362/648] fixed failed unit test that were caused by Drone without file provider, one unit test fails because job has no drone, ticket already exists --- lapis/drone.py | 11 +++++++---- lapis_tests/test_job.py | 5 ++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 5e7541f..388b301 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,7 +1,10 @@ from cobald import interfaces -from usim import time, Scope, instant, Capacities, ResourcesUnavailable, Queue + +from usim import time, Scope, instant, Capacities, ResourcesUnavailable +from typing import Optional from lapis.job import Job +from lapis.file_provider import FileProvider class ResourcesExceeded(Exception): @@ -12,9 +15,9 @@ class Drone(interfaces.Pool): def __init__( self, scheduler, - fileprovider, - pool_resources: dict, - scheduling_duration: float, + fileprovider: FileProvider = FileProvider(), + pool_resources: Optional[dict] = None, + scheduling_duration: Optional[float] = None, ignore_resources: list = None, sitename: str = None, ): diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 181bb1a..208581c 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -3,7 +3,9 @@ from lapis.drone import Drone from lapis.job import Job -from lapis_tests import via_usim, DummyScheduler, DummyDrone + +from lapis.file_provider import FileProvider +from lapis_tests import via_usim, DummyScheduler class TestJob(object): @@ -47,6 +49,7 @@ async def test_job_in_drone(self): scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, + fileprovider=FileProvider(), ) await drone.run() async with Scope() as scope: From 8a776ec6571291814ad7fb597d91c61e0f50dff1 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 8 Nov 2019 09:35:44 +0100 Subject: [PATCH 363/648] changed scoring to take filesizes into consideration --- lapis/file_provider.py | 35 +++++++++++++++++++++++++++-------- lapis/files.py | 1 - lapis/job.py | 17 ++++++++--------- lapis/scheduler.py | 3 ++- lapis/storage.py | 11 ++++++++--- 5 files changed, 45 insertions(+), 22 deletions(-) diff --git 
a/lapis/file_provider.py b/lapis/file_provider.py index 51133b9..d2bc961 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -1,7 +1,7 @@ from lapis.storage import Storage from lapis.files import RequestedFile from typing import Optional -from usim import Queue, Scope +from usim import Queue, Scope, time class FileProvider(object): @@ -48,8 +48,16 @@ async def input_file_coverage( ) ) await score_queue.close() - total_score = await self.calculate_score(score_queue) - return total_score / len(provided_storages) + cached_size = await self.calculate_score(score_queue) + total_size = float( + sum( + [ + inputfilespecs["filesize"] + for _, inputfilespecs in requested_files.items() + ] + ) + ) + return cached_size / total_size else: return 0 @@ -65,13 +73,18 @@ async def look_file_up_in_storage( :param q: :return: """ - file_score = sum( + print( + "LOOK UP: Job {}, File {} @ {}".format( + job_repr, requested_file.filename, time.now + ) + ) + file_score = sorted( [ - await storage.providing_file(requested_file, job_repr) + int(await storage.providing_file(requested_file, job_repr)) for storage in available_storages ] - ) - await q.put({requested_file.filename: file_score}) + )[0] + await q.put({requested_file: file_score}) async def calculate_score(self, queue: Queue): """ @@ -80,4 +93,10 @@ async def calculate_score(self, queue: Queue): :param queue: :return: """ - return sum([1 async for element in queue if list(element.values())[0] > 0]) + return sum( + [ + list(element.keys())[0].filesize + async for element in queue + if list(element.values())[0] > 0 + ] + ) diff --git a/lapis/files.py b/lapis/files.py index 65960f4..15934bb 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -54,5 +54,4 @@ def convert_to_stored_file(self, currenttime): lastaccessed=currenttime, numberofaccesses=1, ) - print("convert file: ", filespecs) return StoredFile(self.filename, filespecs) diff --git a/lapis/job.py b/lapis/job.py index bb3d428..73645b8 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -93,15 +93,14 @@ def waiting_time(self) -> float: return float("Inf") @property - def walltime(self) -> float: - """ - :return: Time that passes while job is running - """ - return self._walltime - - def recalculate_walltime(self): - if self.drone.fileprovider and self.drone.fileprovider.input_file_coverage( - self.drone.sitename, self.used_inputfiles + async def walltime(self): + print("WALLTIME: Job {}".format(repr(self))) + # TODO: reimplement that usedsize != filesize and change back to used_inputfiles + if ( + self.drone.fileprovider + and await self.drone.fileprovider.input_file_coverage( + self.drone.sitename, self.requested_inputfiles, repr(self) + ) ): self._walltime = walltime_models["maxeff"](self, self._walltime) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index d20ad20..0b4c33d 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -88,7 +88,8 @@ async def run(self): async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): - for job in self.job_queue.copy(): + print("NEW SCHEDULING INTERVAL @ {}".format(time.now)) + for job in self.job_queue: best_match = self._schedule_job(job) if best_match: print( diff --git a/lapis/storage.py b/lapis/storage.py index 0147811..4cacfd4 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -80,10 +80,15 @@ async def add_to_storage(self, file: RequestedFile, job_repr): self.filenames.add(file.filename) self.files.add(file) - def update_file(self, stored_file: StoredFile, job_repr): - print("UPDATE: Job 
{}, File {}".format(job_repr, stored_file.filename)) + async def update_file(self, stored_file: StoredFile, job_repr): + await (time + 1) stored_file.lastaccessed = time.now stored_file.increment_accesses() + print( + "UPDATE: Job {}, File {} @ {}".format( + job_repr, stored_file.filename, time.now + ) + ) async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): print( @@ -107,7 +112,7 @@ async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): async def providing_file(self, requested_file: RequestedFile, job_repr): if requested_file.filename in self.filenames: - self.update_file(self.find_file(requested_file.filename), job_repr) + await self.update_file(self.find_file(requested_file.filename), job_repr) return True else: await self.apply_caching_decision(requested_file, job_repr) From 787d6ccb98ff763ae26dd5819d36fcc153ac3ef8 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 8 Nov 2019 09:55:59 +0100 Subject: [PATCH 364/648] fixed bug from merge --- lapis/job.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index 73645b8..caa399c 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -95,7 +95,6 @@ def waiting_time(self) -> float: @property async def walltime(self): print("WALLTIME: Job {}".format(repr(self))) - # TODO: reimplement that usedsize != filesize and change back to used_inputfiles if ( self.drone.fileprovider and await self.drone.fileprovider.input_file_coverage( From 02b1c3b06d3c9bde607bb262e1971bc913abe94f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 8 Nov 2019 10:13:47 +0100 Subject: [PATCH 365/648] removed debug output to fix unit test --- lapis/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index caa399c..19bff33 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -129,7 +129,7 @@ async def run(self): await sampling_required.put(self) def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, self._name or id(self) % 100) + return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) async def job_to_queue_scheduler(job_generator, job_queue): From 8ceb544252236d73543ae4751b272ca8ab88c79b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 11 Nov 2019 20:36:56 +0100 Subject: [PATCH 366/648] First steps towards including everything concerning caching into monitoring --- lapis/monitor/general.py | 28 ++++++++++++++++++++++++++++ lapis/simulator.py | 2 ++ 2 files changed, 30 insertions(+) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index be6d24d..e11e3f5 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -10,6 +10,7 @@ from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler from lapis.pool import Pool from lapis.scheduler import CondorJobScheduler, JobQueue +from lapis.storage import Storage if TYPE_CHECKING: from lapis.simulator import Simulator @@ -215,3 +216,30 @@ def configuration_information(simulator: "Simulator") -> List[Dict]: tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 ), } + + +def storage_status(storage: Storage): + """ + Log information about current storage object state + :param storage: + :return: list of records for logging + """ + results = [ + { + "usedstorage": storage.usedstorage, + "storagesize": storage.storagesize, + "numberoffiles": len(storage.filenames), + } + ] + return results + + +storage_status.name = "storage_status" +storage_status.whitelist = (Storage,) +storage_status.logging_formatter = { + 
LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), +} diff --git a/lapis/simulator.py b/lapis/simulator.py index 83bc653..00a8056 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -14,6 +14,7 @@ pool_status, configuration_information, job_events, + storage_status, ) from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -47,6 +48,7 @@ def enable_monitoring(self): self.monitoring.register_statistic(resource_statistics) self.monitoring.register_statistic(pool_status) self.monitoring.register_statistic(configuration_information) + self.monitoring.register_statistic(storage_status) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) From 8af5722836b816ea0f10ac9f84d0617f59bb09f1 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 11 Nov 2019 20:38:43 +0100 Subject: [PATCH 367/648] renamed method --- lapis/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/files.py b/lapis/files.py index 15934bb..68727c1 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -47,7 +47,7 @@ def __init__(self, filename: str, filespecs: dict): def filesize(self): return self._filesize - def convert_to_stored_file(self, currenttime): + def convert_to_stored_file_object(self, currenttime): filespecs = dict( filesize=self._filesize, cachedsince=currenttime, From e54607c189cffb4ba235c6640b309a30172967a6 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 11 Nov 2019 20:42:44 +0100 Subject: [PATCH 368/648] split processing of job into file transfer and actual calculation --- lapis/job.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 19bff33..5d4976b 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -93,15 +93,27 @@ def waiting_time(self) -> float: return float("Inf") @property - async def walltime(self): - print("WALLTIME: Job {}".format(repr(self))) - if ( - self.drone.fileprovider - and await self.drone.fileprovider.input_file_coverage( - self.drone.sitename, self.requested_inputfiles, repr(self) + def walltime(self) -> float: + """ + :return: Time that passes while job is running + """ + return self._walltime + + def calculation_time(self): + print("WALLTIME: Job {} @ {}".format(repr(self), time.now)) + return walltime_models["maxeff"](self, self._walltime) + + async def transfer_inputfiles(self): + print("TRANSFERING INPUTFILES: Job {} @ {}".format(repr(self), time.now)) + if self.drone.fileprovider and self.used_inputfiles: + stream_time = await self.drone.fileprovider.transfer_inputfiles( + self.drone, self.requested_inputfiles, repr(self) + ) + print( + "streamed inputfiles {} for job {} in {} timeunits, finished @ {}".format( + self.requested_inputfiles.keys(), repr(self), stream_time, time.now ) - ): - self._walltime = walltime_models["maxeff"](self, self._walltime) + ) async def run(self): self.in_queue_until = time.now @@ -114,8 +126,13 @@ async def run(self): ) ) try: +<<<<<<< HEAD self.recalculate_walltime() await (time + self._walltime) +======= + await self.transfer_inputfiles() + await (time + self.calculation_time()) +>>>>>>> split processing of job into file transfer and actual calculation except CancelTask: self.drone = None self._success = False From 1c35bee208d9aa16ce030a0d5d3fd81464469b59 Mon Sep 17 00:00:00 2001 From: 
tfesenbecker Date: Mon, 11 Nov 2019 20:46:31 +0100 Subject: [PATCH 369/648] refactored storage and file provider objects in order to use Pipe --- lapis/file_provider.py | 125 +++++++++++++++++++------------------ lapis/storage.py | 137 +++++++++++++++++++++++++++++++++-------- 2 files changed, 179 insertions(+), 83 deletions(-) diff --git a/lapis/file_provider.py b/lapis/file_provider.py index d2bc961..768bbbc 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -1,15 +1,16 @@ from lapis.storage import Storage from lapis.files import RequestedFile -from typing import Optional -from usim import Queue, Scope, time +from usim import Queue, Scope, time, Pipe +import random class FileProvider(object): - __slots__ = ("storages",) + __slots__ = ("storages", "remote_connection") - def __init__(self): + def __init__(self, throughput=20): self.storages = dict() + self.remote_connection = Pipe(throughput=throughput) def add_storage_element(self, storage_element: Storage): """ @@ -22,81 +23,87 @@ def add_storage_element(self, storage_element: Storage): except KeyError: self.storages[storage_element.sitename] = [storage_element] - async def input_file_coverage( - self, dronesite: str, requested_files: Optional[dict] = None, job_repr=None - ) -> float: + async def determine_inputfile_source( + self, requested_file: RequestedFile, dronesite: str, job_repr: str + ): """ - Dummy implementation, to be replaced - - :param requested_files: + Collects NamedTuples containing the amount of data of the requested file + cached in a storage element and the storage element for all reachable storage + objects on the drone's site. The tuples are sorted by amount of cached data + and the storage object where the biggest part of the file is cached is + returned. If the file is not cached in any storage object the fileproviders + remote connection is returned. + :param requested_file: :param dronesite: + :param job_repr: :return: """ - print("FILEPROVIDER hit input file coverage") - provided_storages = self.storages.get(dronesite, None) if provided_storages: - score_queue = Queue() + look_up_queue = Queue() async with Scope() as scope: - for inputfilename, inputfilespecs in requested_files.items(): + for storage in provided_storages: scope.do( - self.look_file_up_in_storage( - RequestedFile(inputfilename, inputfilespecs), - provided_storages, - job_repr, - score_queue, - ) + storage.look_up_file(requested_file, look_up_queue, job_repr) ) - await score_queue.close() - cached_size = await self.calculate_score(score_queue) - total_size = float( - sum( - [ - inputfilespecs["filesize"] - for _, inputfilespecs in requested_files.items() - ] - ) + await look_up_queue.close() + storage_list = sorted( + [entry async for entry in look_up_queue], + key=lambda x: x[0], + reverse=True, ) - return cached_size / total_size + if storage_list[0].cached_filesize > 0: + return storage_list[0].storage + else: + return self.remote_connection else: - return 0 + return self.remote_connection - async def look_file_up_in_storage( - self, requested_file: RequestedFile, available_storages: list, job_repr, q - ): + async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): """ - Calculates how many storages provide the requested file, puts result in queue - for readout. + Determines which storage object is used to provide the requested file and + startes the files transfer. For files transfered via remote connection a + potential cache decides whether to cache the file and handles the caching + process. 
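A condensed, stand-alone sketch of the source selection this patch describes (plain stand-ins rather than the actual Storage and Pipe objects): each reachable storage reports how much of the requested file it caches, and the transfer uses the best cache, falling back to the remote connection when nothing is cached.

    from typing import List, NamedTuple

    class LookUpInformation(NamedTuple):
        cached_filesize: int
        storage: str

    def select_source(lookups: List[LookUpInformation], remote: str) -> str:
        # No reachable storage on this site: stream everything remotely.
        if not lookups:
            return remote
        # Prefer the storage that caches the largest part of the requested file.
        best = max(lookups, key=lambda info: info.cached_filesize)
        return best.storage if best.cached_filesize > 0 else remote

    # "cache_b" wins with 300 units cached; with only empty caches the remote link is used.
    print(select_source([LookUpInformation(0, "cache_a"),
                         LookUpInformation(300, "cache_b")], "remote"))
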
:param requested_file: - :param available_storages: + :param dronesite: :param job_repr: - :param q: :return: """ - print( - "LOOK UP: Job {}, File {} @ {}".format( - job_repr, requested_file.filename, time.now - ) + used_connection = await self.determine_inputfile_source( + requested_file, dronesite, job_repr ) - file_score = sorted( - [ - int(await storage.providing_file(requested_file, job_repr)) - for storage in available_storages - ] - )[0] - await q.put({requested_file: file_score}) + print(used_connection) + if used_connection == self.remote_connection: + potential_cache = random.choice(self.storages.get(dronesite, None)) + await used_connection.transfer(requested_file.filesize) + await potential_cache.apply_caching_decision(requested_file, job_repr) - async def calculate_score(self, queue: Queue): + else: + print("now transfering", requested_file.filesize) + await used_connection.transfer(requested_file, job_repr) + print( + "Job {}: finished transfering of file {}: {}GB @ {}".format( + job_repr, requested_file.filename, requested_file.filesize, time.now + ) + ) + + async def transfer_inputfiles(self, drone, requested_files: dict, job_repr): """ - Reads each input files individual score from queue and returns number of input - files that are provided by a storage element. - :param queue: + Converts dict information about requested files to RequestedFile object and + parallely launches streaming for all files + :param drone: + :param requested_files: + :param job_repr: :return: """ - return sum( - [ - list(element.keys())[0].filesize - async for element in queue - if list(element.values())[0] > 0 - ] + start_time = time.now + async with Scope() as scope: + for inputfilename, inputfilespecs in requested_files.items(): + requested_file = RequestedFile(inputfilename, inputfilespecs) + scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) + stream_time = time.now - start_time + print( + "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) ) + return stream_time diff --git a/lapis/storage.py b/lapis/storage.py index 4cacfd4..02a3d0a 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -1,6 +1,6 @@ -from usim import time, Resources +from usim import time, Resources, Pipe, Queue -from typing import Optional +from typing import Optional, NamedTuple from lapis.files import StoredFile, RequestedFile from lapis.cachealgorithm import CacheAlgorithm @@ -12,26 +12,33 @@ class Storage(object): "name", "sitename", "storagesize", - "placement_duration", "deletion_duration", + "update_duration", "_usedstorage", "files", "filenames", "cachealgorithm", + "connection", ) def __init__( - self, name: str, sitename: str, storagesize: int, files: Optional[dict] = None + self, + name: str, + sitename: str, + storagesize: int, + throughput_limit: int = 1, + files: Optional[dict] = None, ): self.name = name self.sitename = sitename - self.placement_duration = 10 self.deletion_duration = 5 + self.update_duration = 1 self.storagesize = storagesize self.files = self._dict_to_file_object(files) self.filenames = set(file.filename for file in self.files) self._usedstorage = Resources(**dict(usedsize=self._initial_used_storage())) self.cachealgorithm = CacheAlgorithm(self) + self.connection = Pipe(throughput_limit) self.__repr__() def _initial_used_storage(self): @@ -53,11 +60,23 @@ def free_space(self): return self.storagesize - self.usedstorage def find_file(self, filename): + """ + Searches storage object for file with passed filename + :param filename: + :return: + """ for 
file in self.files: if file.filename == filename: return file async def remove_from_storage(self, file: StoredFile, job_repr): + """ + Deletes file from storage object. The time this operation takes is defined + by the storages deletion_duration attribute. + :param file: + :param job_repr: Needed for debug output, will be replaced + :return: + """ print( "REMOVE FROM STORAGE: Job {}, File {} @ {}".format( job_repr, file.filename, time.now @@ -69,19 +88,36 @@ async def remove_from_storage(self, file: StoredFile, job_repr): self.files.remove(file) async def add_to_storage(self, file: RequestedFile, job_repr): + """ + Adds file to storage object transfering it through the storage objects + connection. This should be sufficient for now because files are only added + to the storage when they are also transfered through the FileProviders remote + connection. If this simulator is extended to include any kind of + direct file placement this has to be adapted. + :param file: + :param job_repr: Needed for debug output, will be replaced + :return: + """ print( "ADD TO STORAGE: Job {}, File {} @ {}".format( job_repr, file.filename, time.now ) ) - file = file.convert_to_stored_file(time.now) - await (time + self.placement_duration) + file = file.convert_to_stored_file_object(time.now) + print(file.filesize) + await self.connection.transfer(file.filesize) await self._usedstorage.increase(**{"usedsize": file.filesize}) self.filenames.add(file.filename) self.files.add(file) async def update_file(self, stored_file: StoredFile, job_repr): - await (time + 1) + """ + Updates a stored files information upon access. + :param stored_file: + :param job_repr: Needed for debug output, will be replaced + :return: + """ + await (time + self.update_duration) stored_file.lastaccessed = time.now stored_file.increment_accesses() print( @@ -90,7 +126,30 @@ async def update_file(self, stored_file: StoredFile, job_repr): ) ) + async def transfer(self, file, job_repr): + """ + Manages file transfer via the storage elements connection and updates file + information. If the file should have been deleted since it was originally + looked up the resulting error is not raised. + :param file: + :param job_repr: Needed for debug output, will be replaced + :return: + """ + await self.connection.transfer(file.filesize) + try: + await self.update_file(self.find_file(file.filename), job_repr) + except AttributeError: + pass + async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): + """ + Applies the storage objects caching algorithm to the requested_file and + initiates resulting changes like placement and deletion of files + :param requested_file: + :param job_repr: Needed for debug output, will be replaced + :return: + """ + print( "APPLY CACHING DECISION: Job {}, File {} @ {}".format( job_repr, requested_file.filename, time.now @@ -110,23 +169,53 @@ async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): for file in to_be_removed: await self.remove_from_storage(file, job_repr) - async def providing_file(self, requested_file: RequestedFile, job_repr): - if requested_file.filename in self.filenames: - await self.update_file(self.find_file(requested_file.filename), job_repr) - return True + async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_repr): + """ + Searches storage object for the requested_file and sends result (amount of + cached data, storage object) to queue if queue was passed as parameter. + If no queue was passed the result is returned normally. 
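The look-up itself is fanned out with one task per storage, all reporting into a shared usim Queue. A minimal sketch of that pattern, assuming Queue and Scope behave as they are used in this patch (integer payloads stand in for LookUpInformation):

    from usim import run, Queue, Scope

    async def report(queue, value):
        # stands in for one storage.look_up_file(requested_file, queue, job_repr)
        await queue.put(value)

    async def collect_reports(values):
        queue = Queue()
        async with Scope() as scope:
            for value in values:
                scope.do(report(queue, value))
        await queue.close()
        # a closed queue can still be drained with `async for`
        return [entry async for entry in queue]

    async def demo():
        print(await collect_reports([0, 300, 120]))

    run(demo())
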
This might be needed + when looking up files during coordination and is to be removed if it's not + necessary. + :param requested_file: + :param queue: + :param job_repr: Needed for debug output, will be replaced + :return: (amount of cached data, storage object) + """ + print( + "LOOK UP FILE: Job {}, File {}, Storage {} @ {}".format( + job_repr, requested_file.filename, repr(self), time.now + ) + ) + + class LookUpInformation(NamedTuple): + cached_filesize: int + storage: Storage + + if queue: + try: + print(self.find_file(requested_file.filename).filesize) + await queue.put( + LookUpInformation( + self.find_file(requested_file.filename).filesize, self + ) + ) + except AttributeError: + print( + "File {} not cached on any reachable storage".format( + requested_file.filename + ) + ) + await queue.put(LookUpInformation(0, self)) else: - await self.apply_caching_decision(requested_file, job_repr) - return False + return LookUpInformation( + self.find_file(requested_file.filename).filesize, self + ) def __repr__(self): - return ( - "{name} on site {site}: {used}MB of {tot}MB used ({div} %), contains " - "files {filelist}".format( - name=self.name, - site=self.sitename, - used=self.usedstorage, - tot=self.storagesize, - div=100.0 * self.usedstorage / self.storagesize, - filelist=", ".join(self.filenames), - ) + return "{name} on site {site}: {used}MB of {tot}MB used ({div} %)".format( + name=self.name, + site=self.sitename, + used=self.usedstorage, + tot=self.storagesize, + div=100.0 * self.usedstorage / self.storagesize, ) From 89bedecba7a1234e100c97484084e452602c64bf Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 12 Nov 2019 09:56:58 +0100 Subject: [PATCH 370/648] added monitoring for remote and storage connections --- lapis/file_provider.py | 7 ++++- lapis/monitor/general.py | 63 ++++++++++++++++++++++++++++++++++++++-- lapis/simulator.py | 4 +++ lapis/storage.py | 16 +++++----- 4 files changed, 80 insertions(+), 10 deletions(-) diff --git a/lapis/file_provider.py b/lapis/file_provider.py index 768bbbc..003f3b6 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -1,5 +1,7 @@ from lapis.storage import Storage from lapis.files import RequestedFile +from lapis.monitor import sampling_required + from usim import Queue, Scope, time, Pipe import random @@ -73,13 +75,16 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): used_connection = await self.determine_inputfile_source( requested_file, dronesite, job_repr ) - print(used_connection) + if used_connection == self.remote_connection: + await sampling_required.put(used_connection) potential_cache = random.choice(self.storages.get(dronesite, None)) await used_connection.transfer(requested_file.filesize) + await potential_cache.apply_caching_decision(requested_file, job_repr) else: + await sampling_required.put(used_connection) print("now transfering", requested_file.filesize) await used_connection.transfer(requested_file, job_repr) print( diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index e11e3f5..30f3f21 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -12,6 +12,8 @@ from lapis.scheduler import CondorJobScheduler, JobQueue from lapis.storage import Storage +from usim import Pipe + if TYPE_CHECKING: from lapis.simulator import Simulator @@ -218,7 +220,7 @@ def configuration_information(simulator: "Simulator") -> List[Dict]: } -def storage_status(storage: Storage): +def storage_status(storage: Storage) -> list: """ Log information about current 
storage object state :param storage: @@ -226,6 +228,7 @@ def storage_status(storage: Storage): """ results = [ { + "storage": repr(storage), "usedstorage": storage.usedstorage, "storagesize": storage.storagesize, "numberoffiles": len(storage.filenames), @@ -240,6 +243,62 @@ def storage_status(storage: Storage): LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis", "storage_status"}, resolution=1 + ), +} + + +def storage_connection(storage: Storage) -> list: + """ + Log information about the storages connection + :param storage: + :return: + """ + results = [ + { + "storage": repr(storage), + "throughput": storage.connection.throughput, + "requested_throughput": sum(storage.connection._subscriptions.values()), + "throughput_scale": storage.connection._throughput_scale, + } + ] + return results + + +storage_connection.name = "storage_connection" +storage_connection.whitelist = (Storage,) +storage_connection.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "storage_connection"}, resolution=1 + ), +} + + +def remote_connection(remote: Pipe) -> list: + """ + Log information about the remote connection + :param remote: + :return: + """ + results = [ + { + "throughput": remote.throughput, + # "requested_throughput": sum(remote._subscriptions.values()), + "requested_throughput": str(remote._subscriptions.values()), + "throughput_scale": remote._throughput_scale, + } + ] + return results + + +remote_connection.name = "remote_connection" +remote_connection.whitelist = (Pipe,) +remote_connection.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "storage_connection"}, resolution=1 ), } diff --git a/lapis/simulator.py b/lapis/simulator.py index 00a8056..c55e80e 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -15,6 +15,8 @@ configuration_information, job_events, storage_status, + storage_connection, + remote_connection, ) from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -49,6 +51,8 @@ def enable_monitoring(self): self.monitoring.register_statistic(pool_status) self.monitoring.register_statistic(configuration_information) self.monitoring.register_statistic(storage_status) + self.monitoring.register_statistic(storage_connection) + self.monitoring.register_statistic(remote_connection) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/lapis/storage.py b/lapis/storage.py index 02a3d0a..9687095 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -212,10 +212,12 @@ class LookUpInformation(NamedTuple): ) def __repr__(self): - return "{name} on site {site}: {used}MB of {tot}MB used ({div} %)".format( - name=self.name, - site=self.sitename, - used=self.usedstorage, - tot=self.storagesize, - div=100.0 * self.usedstorage / self.storagesize, - ) + return "<%s: %s>" % (self.__class__.__name__, self.name or id(self)) + + # return "{name} on site {site}: {used}MB of {tot}MB used ({div} %)".format( + # name=self.name, + # site=self.sitename, + # used=self.usedstorage, + # tot=self.storagesize, + # div=100.0 * self.usedstorage / 
self.storagesize, + # ) From 4405b68a8ac5367e73ccd5944c83a0c21cdab6b0 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 12 Nov 2019 10:38:20 +0100 Subject: [PATCH 371/648] small fix in monitoring --- lapis/monitor/general.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 30f3f21..876e06e 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -285,8 +285,7 @@ def remote_connection(remote: Pipe) -> list: results = [ { "throughput": remote.throughput, - # "requested_throughput": sum(remote._subscriptions.values()), - "requested_throughput": str(remote._subscriptions.values()), + "requested_throughput": sum(remote._subscriptions.values()), "throughput_scale": remote._throughput_scale, } ] From 08ab6e0a5ec5872a30180e3a7f12b7336eae0872 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 17 Nov 2019 14:24:28 +0100 Subject: [PATCH 372/648] adapted job walltime to new job processing in order to fix job event monitoring --- lapis/job.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 5d4976b..fbf6fa3 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -16,6 +16,7 @@ class Job(object): "resources", "used_resources", "_walltime", + "_streamtime", "requested_walltime", "queue_date", "requested_inputfiles", @@ -61,6 +62,7 @@ def __init__( ) self.resources[key] = self.used_resources[key] self._walltime = used_resources.pop("walltime") + self._streamtime = 0 self.requested_walltime = resources.pop("walltime", None) self.requested_inputfiles = resources.pop("inputfiles", None) self.used_inputfiles = used_resources.pop("inputfiles", None) @@ -97,7 +99,7 @@ def walltime(self) -> float: """ :return: Time that passes while job is running """ - return self._walltime + return self._streamtime + self.calculation_time() def calculation_time(self): print("WALLTIME: Job {} @ {}".format(repr(self), time.now)) @@ -106,12 +108,13 @@ def calculation_time(self): async def transfer_inputfiles(self): print("TRANSFERING INPUTFILES: Job {} @ {}".format(repr(self), time.now)) if self.drone.fileprovider and self.used_inputfiles: - stream_time = await self.drone.fileprovider.transfer_inputfiles( + self._streamtime = await self.drone.fileprovider.transfer_inputfiles( self.drone, self.requested_inputfiles, repr(self) ) + print( "streamed inputfiles {} for job {} in {} timeunits, finished @ {}".format( - self.requested_inputfiles.keys(), repr(self), stream_time, time.now + self.requested_inputfiles.keys(), repr(self), self._streamtime, time.now ) ) From 331ee5eaa8e0125895f571997a90e2e490b7f746 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 17 Nov 2019 14:26:10 +0100 Subject: [PATCH 373/648] minor clean ups --- lapis/storage.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 9687095..2e37152 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -23,10 +23,10 @@ class Storage(object): def __init__( self, - name: str, - sitename: str, - storagesize: int, - throughput_limit: int = 1, + name: str = None, + sitename: str = None, + storagesize: int = 1000, + throughput_limit: int = 10, files: Optional[dict] = None, ): self.name = name @@ -48,8 +48,9 @@ def _initial_used_storage(self): def _dict_to_file_object(self, files): files_set = set() - for filename, filespecs in files.items(): - files_set.add(StoredFile(filename, filespecs)) + if files: + for filename, filespecs in files.items(): + 
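The walltime change in PATCH 372 above reports the sum of streaming time and calculation time; a plain-Python sketch of that bookkeeping (class and attribute names are stand-ins for the Job implementation, values invented):

    class JobTimes:
        # stand-ins for Job._streamtime / Job._calculationtime, not the real class
        def __init__(self, calculation_time: float):
            self.calculation_time = calculation_time
            self.streamtime = 0.0

        @property
        def walltime(self) -> float:
            # what the monitoring sees: transfer time plus pure calculation time
            return self.streamtime + self.calculation_time

    job = JobTimes(calculation_time=3600.0)
    job.streamtime = 250.0   # set once transfer_inputfiles() has finished
    assert job.walltime == 3850.0
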
files_set.add(StoredFile(filename, filespecs)) return files_set @property From a94bc8ac725ae549063db891b0171043d8c71e0f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 17 Nov 2019 14:27:56 +0100 Subject: [PATCH 374/648] added cache modelation via cachehitrate --- lapis/file_provider.py | 69 +++++++++++++++++++++++++++++++++++++++--- lapis/simulator.py | 2 +- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/lapis/file_provider.py b/lapis/file_provider.py index 003f3b6..909da6e 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -1,3 +1,5 @@ +import logging + from lapis.storage import Storage from lapis.files import RequestedFile from lapis.monitor import sampling_required @@ -8,11 +10,18 @@ class FileProvider(object): - __slots__ = ("storages", "remote_connection") + __slots__ = ( + "storages", + "remote_connection", + "cache_hitrate_approach", + "cachehitrate", + ) - def __init__(self, throughput=20): + def __init__(self, throughput=100, cache_hitrate_approach=False): self.storages = dict() self.remote_connection = Pipe(throughput=throughput) + self.cache_hitrate_approach = cache_hitrate_approach + self.cachehitrate = 0.6 def add_storage_element(self, storage_element: Storage): """ @@ -80,8 +89,8 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): await sampling_required.put(used_connection) potential_cache = random.choice(self.storages.get(dronesite, None)) await used_connection.transfer(requested_file.filesize) - - await potential_cache.apply_caching_decision(requested_file, job_repr) + if potential_cache: + await potential_cache.apply_caching_decision(requested_file, job_repr) else: await sampling_required.put(used_connection) @@ -106,9 +115,59 @@ async def transfer_inputfiles(self, drone, requested_files: dict, job_repr): async with Scope() as scope: for inputfilename, inputfilespecs in requested_files.items(): requested_file = RequestedFile(inputfilename, inputfilespecs) - scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) + if self.cache_hitrate_approach: + + scope.do( + self.transfer_by_cache_hitrate( + self.storages.get(drone.sitename, None), requested_file + ) + ) + else: + scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) stream_time = time.now - start_time print( "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) ) return stream_time + + async def transfer_by_cache_hitrate( + self, available_storages: Storage, requested_file: RequestedFile + ): + if not available_storages and self.cachehitrate: + logging.getLogger("implementation").error( + "no available caches for drone " + " requested cachehitrate was " + "{}".format(self.cachehitrate) + ) + else: + if 0 < self.cachehitrate < 1: + async with Scope() as scope: + scope.do( + self.transfer_wrapper( + self.remote_connection, + (1.0 - self.cachehitrate) * requested_file.filesize, + ) + ) + scope.do( + self.transfer_wrapper( + available_storages[0].connection, + self.cachehitrate * requested_file.filesize, + ) + ) + elif self.cachehitrate == 1: + await available_storages[0].connection.transfer(requested_file.filesize) + elif self.cachehitrate == 0: + await self.remote_connection.transfer(requested_file.filesize) + + async def transfer_wrapper(self, connection, total): + print( + "transfering {} with {}, start @ {}".format( + total, connection.throughput, time.now + ) + ) + await connection.transfer(total=total) + print( + "transfering {} with {}, stop @ {}".format( + total, connection.throughput, 
time.now + ) + ) diff --git a/lapis/simulator.py b/lapis/simulator.py index c55e80e..5d94119 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -30,7 +30,7 @@ def __init__(self, seed=1234): random.seed(seed) self.job_queue = Queue() self.pools = [] - self.fileprovider = FileProvider() + self.fileprovider = FileProvider(cache_hitrate_approach=True) self.controllers = [] self.job_scheduler = None self.job_generator = None From 14b7e1830cddd12e47d16cbc78d76b6d438ae94e Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 17 Nov 2019 14:36:32 +0100 Subject: [PATCH 375/648] Update lapis/cachealgorithm.py Co-Authored-By: Max Fischer --- lapis/cachealgorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py index 6064de5..be762cf 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/cachealgorithm.py @@ -47,6 +47,6 @@ def consider(self, candidate: RequestedFile) -> Optional[Set[RequestedFile]]: if self._storage.free_space() < candidate.filesize: return self._context_based_consideration(candidate) else: - return {} + return set() else: return {candidate} From 9c8910086ef4cd9e420dcd64fb44a8e5c1ff145f Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 17 Nov 2019 14:47:55 +0100 Subject: [PATCH 376/648] Update lapis/storage.py Co-Authored-By: Max Fischer --- lapis/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage.py b/lapis/storage.py index 2e37152..1450d9a 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -66,7 +66,7 @@ def find_file(self, filename): :param filename: :return: """ - for file in self.files: + return filename in self.files if file.filename == filename: return file From ecd5ca06d62c0ad5ee6a1f16fc7818c5c818085b Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 17 Nov 2019 14:50:18 +0100 Subject: [PATCH 377/648] Update lapis/storage.py Co-Authored-By: Max Fischer --- lapis/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage.py b/lapis/storage.py index 1450d9a..ccdd740 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -36,7 +36,7 @@ def __init__( self.storagesize = storagesize self.files = self._dict_to_file_object(files) self.filenames = set(file.filename for file in self.files) - self._usedstorage = Resources(**dict(usedsize=self._initial_used_storage())) + self._usedstorage = Resources(usedsize=sum(file.filesize for file in self.files)) self.cachealgorithm = CacheAlgorithm(self) self.connection = Pipe(throughput_limit) self.__repr__() From 7a0d535333a97d4fc29d17560a0a0b2677ea6572 Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 17 Nov 2019 14:53:16 +0100 Subject: [PATCH 378/648] Update lapis/storage.py Co-Authored-By: Max Fischer --- lapis/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage.py b/lapis/storage.py index ccdd740..9e98ceb 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -55,7 +55,7 @@ def _dict_to_file_object(self, files): @property def usedstorage(self): - return dict(self._usedstorage.levels)["usedsize"] + return self._usedstorage.levels.usedsize def free_space(self): return self.storagesize - self.usedstorage From 2a723314106f8eb7e71e58d81dcdaa512715a88a Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 
17 Nov 2019 14:53:55 +0100 Subject: [PATCH 379/648] Update lapis/storage.py Co-Authored-By: Max Fischer --- lapis/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage.py b/lapis/storage.py index 9e98ceb..56790f1 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -60,7 +60,7 @@ def usedstorage(self): def free_space(self): return self.storagesize - self.usedstorage - def find_file(self, filename): + def find_file(self, filename: str) -> bool: """ Searches storage object for file with passed filename :param filename: From 2a3ec9ba1558f54a4dec24f9d4cb9e7e258d95ec Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 17 Nov 2019 14:55:15 +0100 Subject: [PATCH 380/648] Update lapis/storage.py Co-Authored-By: Max Fischer --- lapis/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage.py b/lapis/storage.py index 56790f1..7678a43 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -84,7 +84,7 @@ async def remove_from_storage(self, file: StoredFile, job_repr): ) ) await (time + self.deletion_duration) - await self._usedstorage.decrease(**{"usedsize": file.filesize}) + await self._usedstorage.decrease(usedsize=file.filesize) self.filenames.remove(file.filename) self.files.remove(file) From 04dbb190bad5ab14ca6159701265813946b4738b Mon Sep 17 00:00:00 2001 From: tfesenbecker <50665055+tfesenbecker@users.noreply.github.com> Date: Sun, 17 Nov 2019 14:56:35 +0100 Subject: [PATCH 381/648] Update lapis/storage.py Co-Authored-By: Max Fischer --- lapis/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage.py b/lapis/storage.py index 7678a43..4440d6e 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -107,7 +107,7 @@ async def add_to_storage(self, file: RequestedFile, job_repr): file = file.convert_to_stored_file_object(time.now) print(file.filesize) await self.connection.transfer(file.filesize) - await self._usedstorage.increase(**{"usedsize": file.filesize}) + await self._usedstorage.increase(usedsize=file.filesize) self.filenames.add(file.filename) self.files.add(file) From a5ea3e1c048a2fde86256f118744fc930491893d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 17 Nov 2019 18:04:24 +0100 Subject: [PATCH 382/648] resolved PEP8 issue --- lapis/storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 4440d6e..ba58303 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -36,7 +36,9 @@ def __init__( self.storagesize = storagesize self.files = self._dict_to_file_object(files) self.filenames = set(file.filename for file in self.files) - self._usedstorage = Resources(usedsize=sum(file.filesize for file in self.files)) + self._usedstorage = Resources( + usedsize=sum(file.filesize for file in self.files) + ) self.cachealgorithm = CacheAlgorithm(self) self.connection = Pipe(throughput_limit) self.__repr__() @@ -67,8 +69,6 @@ def find_file(self, filename: str) -> bool: :return: """ return filename in self.files - if file.filename == filename: - return file async def remove_from_storage(self, file: StoredFile, job_repr): """ From de319a9dd38b18775a13d9003be2fb23547da57d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 19 Nov 2019 14:10:46 +0100 Subject: [PATCH 383/648] minor fix --- lapis/job.py | 8 ++++---- lapis/job_io/htcondor.py | 2 ++ lapis/storage_io/storage.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lapis/job.py b/lapis/job.py 
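The suggestion commits above keep the cache's used space in a usim Resources pool addressed purely by keyword. A minimal accounting sketch, assuming Resources behaves as it is used in these patches (sizes invented):

    from usim import run, Resources

    async def cache_accounting():
        usedstorage = Resources(usedsize=1000)      # space taken by pre-placed files
        await usedstorage.increase(usedsize=1200)   # a new file was cached
        await usedstorage.decrease(usedsize=200)    # an old file was evicted
        print(usedstorage.levels.usedsize)          # 2000

    run(cache_accounting())
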
index fbf6fa3..d203caf 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -112,11 +112,11 @@ async def transfer_inputfiles(self): self.drone, self.requested_inputfiles, repr(self) ) - print( - "streamed inputfiles {} for job {} in {} timeunits, finished @ {}".format( - self.requested_inputfiles.keys(), repr(self), self._streamtime, time.now + print( + "streamed inputfiles {} for job {} in {} timeunits, finished @ {}".format( + self.requested_inputfiles.keys(), repr(self), self._streamtime, time.now + ) ) - ) async def run(self): self.in_queue_until = time.now diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 60a3684..f22be5e 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -71,6 +71,8 @@ def htcondor_job_reader( ) try: + if not entry["Inputfiles"]: + del entry["Inputfiles"] resources["inputfiles"] = deepcopy(entry["Inputfiles"]) used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for filename, filespecs in entry["Inputfiles"].items(): diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 88a0ed6..a4f5d8c 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -9,7 +9,7 @@ def storage_reader(storage, storage_content): yield Storage( name=row["name"], sitename=row["sitename"], - storagesize=int(row["cachesizeMB"]), + storagesize=int(row["cachesizeGB"]), files=storage_content[row["name"]], ) From 93b9d76ed848ea469b24d5d84ebdb82a4b5c1c30 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 20 Nov 2019 15:34:39 +0100 Subject: [PATCH 384/648] moved definition of remote throughput to CLI input, storage object throughput to storage input file and caching via fixed cache hitrate to CLI input --- lapis/cli/simulate.py | 6 +++- lapis/file_provider.py | 15 +++------ lapis/monitor/general.py | 2 +- lapis/simulator.py | 11 ++++++- lapis/storage.py | 64 +++++++++++++++---------------------- lapis/storage_io/storage.py | 1 + 6 files changed, 47 insertions(+), 52 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 85d6854..6057c49 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -83,8 +83,10 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): click.Choice(list(storage_import_mapper.keys())), ), ) +@click.option("--remote-throughput", "remote_throughput", type=float, default=10) +@click.option("--cache-hitrate", "cache_hitrate", type=float, default=None) @click.pass_context -def static(ctx, job_file, pool_file, storage_files): +def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hitrate): click.echo("starting static environment") simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file @@ -92,6 +94,8 @@ def static(ctx, job_file, pool_file, storage_files): job_input=file, job_reader=job_import_mapper[file_type] ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.create_connection_module(remote_throughput, cache_hitrate) + for current_pool in pool_file: pool_file, pool_file_type = current_pool simulator.create_pools( diff --git a/lapis/file_provider.py b/lapis/file_provider.py index 909da6e..166dfdf 100644 --- a/lapis/file_provider.py +++ b/lapis/file_provider.py @@ -10,18 +10,12 @@ class FileProvider(object): - __slots__ = ( - "storages", - "remote_connection", - "cache_hitrate_approach", - "cachehitrate", - ) + __slots__ = ("storages", "remote_connection", "cachehitrate") - def __init__(self, throughput=100, cache_hitrate_approach=False): + def __init__(self, throughput=100, cache_hitrate=None): 
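A hedged wiring sketch of how the new CLI knobs reach the simulation (inputs, readers and the actual run are elided, values invented; module paths as laid out in this repository): the connection module is created after the scheduler and before pools and storages, matching the asserts this patch adds to Simulator.

    from lapis.scheduler import CondorJobScheduler
    from lapis.simulator import Simulator

    simulator = Simulator(seed=1234)
    simulator.create_scheduler(scheduler_type=CondorJobScheduler)
    # remote link with 10 throughput units, 60 % of each input file assumed cached
    simulator.create_connection_module(10, cache_hitrate=0.6)
    # simulator.create_pools(...) and simulator.create_storage(...) would follow,
    # then simulator.run(until=...)
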
self.storages = dict() self.remote_connection = Pipe(throughput=throughput) - self.cache_hitrate_approach = cache_hitrate_approach - self.cachehitrate = 0.6 + self.cachehitrate = cache_hitrate def add_storage_element(self, storage_element: Storage): """ @@ -111,11 +105,12 @@ async def transfer_inputfiles(self, drone, requested_files: dict, job_repr): :param job_repr: :return: """ + print("registered caches", self.storages) start_time = time.now async with Scope() as scope: for inputfilename, inputfilespecs in requested_files.items(): requested_file = RequestedFile(inputfilename, inputfilespecs) - if self.cache_hitrate_approach: + if self.cachehitrate is not None: scope.do( self.transfer_by_cache_hitrate( diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 876e06e..eb8b8a0 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -231,7 +231,7 @@ def storage_status(storage: Storage) -> list: "storage": repr(storage), "usedstorage": storage.usedstorage, "storagesize": storage.storagesize, - "numberoffiles": len(storage.filenames), + "numberoffiles": len(storage.files), } ] return results diff --git a/lapis/simulator.py b/lapis/simulator.py index 5d94119..dedb2bb 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -30,7 +30,7 @@ def __init__(self, seed=1234): random.seed(seed) self.job_queue = Queue() self.pools = [] - self.fileprovider = FileProvider(cache_hitrate_approach=True) + self.fileprovider: FileProvider self.controllers = [] self.job_scheduler = None self.job_generator = None @@ -59,6 +59,9 @@ def create_job_generator(self, job_input, job_reader): def create_pools(self, pool_input, pool_reader, pool_type, controller=None): assert self.job_scheduler, "Scheduler needs to be created before pools" + assert self.fileprovider, ( + "Connection module needs to be created before " "storages" + ) for pool in pool_reader( iterable=pool_input, pool_type=pool_type, @@ -69,6 +72,9 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): self.controllers.append(controller(target=pool, rate=1)) def create_storage(self, storage_input, storage_content_input, storage_reader): + assert self.fileprovider, ( + "Connection module needs to be created before " "storages" + ) for storage in storage_reader( storage=storage_input, storage_content=storage_content_input ): @@ -77,6 +83,9 @@ def create_storage(self, storage_input, storage_content_input, storage_reader): def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) + def create_connection_module(self, remote_throughput, cache_hitrate=None): + self.fileprovider = FileProvider(remote_throughput, cache_hitrate) + def run(self, until=None): print(f"running until {until}") run(self._simulate(until)) diff --git a/lapis/storage.py b/lapis/storage.py index ba58303..06770fe 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -34,27 +34,17 @@ def __init__( self.deletion_duration = 5 self.update_duration = 1 self.storagesize = storagesize - self.files = self._dict_to_file_object(files) - self.filenames = set(file.filename for file in self.files) + self.files = { + filename: StoredFile(filename, filespecs) + for filename, filespecs in files.items() + } self._usedstorage = Resources( - usedsize=sum(file.filesize for file in self.files) + usedsize=sum(file.filesize for file in list(self.files.values())) ) self.cachealgorithm = CacheAlgorithm(self) self.connection = Pipe(throughput_limit) self.__repr__() - def _initial_used_storage(self): - 
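For the fixed-hit-rate branch selected above, the transfer introduced in PATCH 374 splits every file between the site cache's pipe and the remote pipe. A self-contained usim sketch of that split, assuming Pipe.transfer as used in these patches (throughputs and sizes invented):

    from usim import run, time, Scope, Pipe

    async def split_transfer(filesize, hitrate, cache_pipe, remote_pipe):
        # the cached fraction streams through the site cache while the rest
        # comes in over the remote connection, both at the same time
        async with Scope() as scope:
            scope.do(cache_pipe.transfer(total=hitrate * filesize))
            scope.do(remote_pipe.transfer(total=(1.0 - hitrate) * filesize))

    async def demo():
        await split_transfer(1000.0, 0.6, Pipe(throughput=10), Pipe(throughput=5))
        print("both parts done @", time.now)

    run(demo())
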
initial_value = sum(file.filesize for file in self.files) - print("{} set initial value {}".format(self.name, initial_value)) - return initial_value - - def _dict_to_file_object(self, files): - files_set = set() - if files: - for filename, filespecs in files.items(): - files_set.add(StoredFile(filename, filespecs)) - return files_set - @property def usedstorage(self): return self._usedstorage.levels.usedsize @@ -62,13 +52,16 @@ def usedstorage(self): def free_space(self): return self.storagesize - self.usedstorage - def find_file(self, filename: str) -> bool: + def find_file(self, filename: str) -> StoredFile: """ Searches storage object for file with passed filename :param filename: - :return: + :return: corresponding file object """ - return filename in self.files + try: + return self.files[filename] + except KeyError: + raise KeyError async def remove_from_storage(self, file: StoredFile, job_repr): """ @@ -85,8 +78,8 @@ async def remove_from_storage(self, file: StoredFile, job_repr): ) await (time + self.deletion_duration) await self._usedstorage.decrease(usedsize=file.filesize) - self.filenames.remove(file.filename) - self.files.remove(file) + # self.filenames.remove(file.filename) + self.files.pop(file.filename) async def add_to_storage(self, file: RequestedFile, job_repr): """ @@ -105,11 +98,10 @@ async def add_to_storage(self, file: RequestedFile, job_repr): ) ) file = file.convert_to_stored_file_object(time.now) - print(file.filesize) await self.connection.transfer(file.filesize) await self._usedstorage.increase(usedsize=file.filesize) - self.filenames.add(file.filename) - self.files.add(file) + # self.filenames.add(file.filename) + self.files[file.filename] = file async def update_file(self, stored_file: StoredFile, job_repr): """ @@ -192,25 +184,19 @@ class LookUpInformation(NamedTuple): cached_filesize: int storage: Storage - if queue: - try: - print(self.find_file(requested_file.filename).filesize) - await queue.put( - LookUpInformation( - self.find_file(requested_file.filename).filesize, self - ) + try: + await queue.put( + LookUpInformation( + self.find_file(requested_file.filename).filesize, self ) - except AttributeError: - print( - "File {} not cached on any reachable storage".format( - requested_file.filename - ) + ) + except KeyError: + print( + "File {} not cached on any reachable storage".format( + requested_file.filename ) - await queue.put(LookUpInformation(0, self)) - else: - return LookUpInformation( - self.find_file(requested_file.filename).filesize, self ) + await queue.put(LookUpInformation(0, self)) def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self.name or id(self)) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index a4f5d8c..5be532e 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -10,6 +10,7 @@ def storage_reader(storage, storage_content): name=row["name"], sitename=row["sitename"], storagesize=int(row["cachesizeGB"]), + throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], ) From 846bccb6d047601c5ed7fe1949fc0a095a5268e4 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 20 Nov 2019 16:09:23 +0100 Subject: [PATCH 385/648] Extended cache algorithm documentation --- lapis/cachealgorithm.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py index be762cf..b3c5175 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/cachealgorithm.py @@ -6,6 +6,13 @@ class CacheAlgorithm(object): def 
__init__(self, storage, additional_information: Optional[str] = None): + """ + Cache Algorithm class defining the handling of uncached files. + It's functionality is called via the consider() function. + :param storage: storage object that this algorithm is + :param additional_information: placeholder for additional external + information that might be passed to the cache algoritm. + """ self._storage = storage self._additional_information = additional_information @@ -43,6 +50,14 @@ def _context_based_consideration(self, candidate: RequestedFile): return {candidate} def consider(self, candidate: RequestedFile) -> Optional[Set[RequestedFile]]: + """ + Decides whether the requested file should be cached. + The decision is split into a decision that is based on the + requested file only and a decision that takes the overall context (current + cache state, other cached files) into account. + :param candidate: + :return: + """ if self._file_based_consideration(candidate): if self._storage.free_space() < candidate.filesize: return self._context_based_consideration(candidate) From a00f366a195451972daf1095692d7db82d8e987c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 20 Nov 2019 17:07:21 +0100 Subject: [PATCH 386/648] implemented minor changes requested in PRs --- lapis/cachealgorithm.py | 2 +- lapis/{file_provider.py => connection.py} | 17 +++++--- lapis/drone.py | 6 +-- lapis/files.py | 50 +++++------------------ lapis/job.py | 24 ++++++----- lapis/simulator.py | 14 +++---- lapis/storage.py | 24 +++++------ 7 files changed, 58 insertions(+), 79 deletions(-) rename lapis/{file_provider.py => connection.py} (93%) diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py index b3c5175..6f5048b 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/cachealgorithm.py @@ -11,7 +11,7 @@ def __init__(self, storage, additional_information: Optional[str] = None): It's functionality is called via the consider() function. :param storage: storage object that this algorithm is :param additional_information: placeholder for additional external - information that might be passed to the cache algoritm. + information that might be passed to the cache algoritm. """ self._storage = storage self._additional_information = additional_information diff --git a/lapis/file_provider.py b/lapis/connection.py similarity index 93% rename from lapis/file_provider.py rename to lapis/connection.py index 166dfdf..7adc635 100644 --- a/lapis/file_provider.py +++ b/lapis/connection.py @@ -8,7 +8,7 @@ import random -class FileProvider(object): +class Connection(object): __slots__ = ("storages", "remote_connection", "cachehitrate") @@ -19,7 +19,8 @@ def __init__(self, throughput=100, cache_hitrate=None): def add_storage_element(self, storage_element: Storage): """ - Register storage element in FileProvider clustering storage elements by sitename + Register storage element in Connetion module clustering storage elements by + sitename :param storage_element: :return: """ @@ -36,7 +37,7 @@ async def determine_inputfile_source( cached in a storage element and the storage element for all reachable storage objects on the drone's site. The tuples are sorted by amount of cached data and the storage object where the biggest part of the file is cached is - returned. If the file is not cached in any storage object the fileproviders + returned. If the file is not cached in any storage object the connection module remote connection is returned. 
:param requested_file: :param dronesite: @@ -79,9 +80,11 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): requested_file, dronesite, job_repr ) - if used_connection == self.remote_connection: + if used_connection == self.remote_connection and self.storages.get( + dronesite, None + ): await sampling_required.put(used_connection) - potential_cache = random.choice(self.storages.get(dronesite, None)) + potential_cache = random.choice(self.storages[dronesite]) await used_connection.transfer(requested_file.filesize) if potential_cache: await potential_cache.apply_caching_decision(requested_file, job_repr) @@ -109,7 +112,9 @@ async def transfer_inputfiles(self, drone, requested_files: dict, job_repr): start_time = time.now async with Scope() as scope: for inputfilename, inputfilespecs in requested_files.items(): - requested_file = RequestedFile(inputfilename, inputfilespecs) + requested_file = RequestedFile( + inputfilename, inputfilespecs["filesize"] + ) if self.cachehitrate is not None: scope.do( diff --git a/lapis/drone.py b/lapis/drone.py index 388b301..ac5160d 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -4,7 +4,7 @@ from typing import Optional from lapis.job import Job -from lapis.file_provider import FileProvider +from lapis.connection import Connection class ResourcesExceeded(Exception): @@ -15,7 +15,7 @@ class Drone(interfaces.Pool): def __init__( self, scheduler, - fileprovider: FileProvider = FileProvider(), + connection: Connection = Connection(), pool_resources: Optional[dict] = None, scheduling_duration: Optional[float] = None, ignore_resources: list = None, @@ -28,7 +28,7 @@ def __init__( """ super(Drone, self).__init__() self.scheduler = scheduler - self.fileprovider = fileprovider + self.connection = connection self.sitename = sitename self.pool_resources = pool_resources self.resources = Capacities(**pool_resources) diff --git a/lapis/files.py b/lapis/files.py index 68727c1..9158674 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -1,55 +1,27 @@ -from typing import Optional +from typing import Optional, NamedTuple class StoredFile(object): def __init__(self, filename, filespecs): self.filename = filename - self._filesize: Optional[int] = filespecs.get("filesize", None) - self._storedsize: Optional[int] = filespecs.get("storedsize", self._filesize) - self._cachedsince: Optional[int] = filespecs.get("cachedsince", None) - self._lastaccessed: Optional[int] = filespecs.get("lastaccessed", None) - self._numberofaccesses: int = filespecs.get("numberofaccesses", 0) - - @property - def filesize(self): - return self._filesize - - @property - def cachedsince(self): - return self._cachedsince - - @property - def lastaccessed(self): - return self._lastaccessed - - @property - def numberofaccesses(self): - return self._numberofaccesses - - @cachedsince.setter - def cachedsince(self, value: int): - self._cachedsince = value - - @lastaccessed.setter - def lastaccessed(self, value: int): - self._lastaccessed = value + self.filesize: Optional[int] = filespecs.get("filesize", None) + self.storedsize: Optional[int] = filespecs.get("storedsize", self.filesize) + self.cachedsince: Optional[int] = filespecs.get("cachedsince", None) + self.lastaccessed: Optional[int] = filespecs.get("lastaccessed", None) + self.numberofaccesses: int = filespecs.get("numberofaccesses", 0) def increment_accesses(self): self._numberofaccesses += 1 -class RequestedFile(object): - def __init__(self, filename: str, filespecs: dict): - self.filename: str = filename - 
self._filesize: Optional[int] = filespecs.get("filesize", None) - - @property - def filesize(self): - return self._filesize +class RequestedFile(NamedTuple): + filename: str + filesize: Optional[int] = None def convert_to_stored_file_object(self, currenttime): + print(self.filesize) filespecs = dict( - filesize=self._filesize, + filesize=self.filesize, cachedsince=currenttime, lastaccessed=currenttime, numberofaccesses=1, diff --git a/lapis/job.py b/lapis/job.py index d203caf..331bc4e 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -16,6 +16,7 @@ class Job(object): "resources", "used_resources", "_walltime", + "_calculationtime", "_streamtime", "requested_walltime", "queue_date", @@ -62,6 +63,7 @@ def __init__( ) self.resources[key] = self.used_resources[key] self._walltime = used_resources.pop("walltime") + self._calculationtime = walltime_models["maxeff"](self, self._walltime) self._streamtime = 0 self.requested_walltime = resources.pop("walltime", None) self.requested_inputfiles = resources.pop("inputfiles", None) @@ -99,22 +101,26 @@ def walltime(self) -> float: """ :return: Time that passes while job is running """ - return self._streamtime + self.calculation_time() + return self._streamtime + self.calculation_time + @property def calculation_time(self): print("WALLTIME: Job {} @ {}".format(repr(self), time.now)) - return walltime_models["maxeff"](self, self._walltime) + return self._calculationtime async def transfer_inputfiles(self): print("TRANSFERING INPUTFILES: Job {} @ {}".format(repr(self), time.now)) - if self.drone.fileprovider and self.used_inputfiles: - self._streamtime = await self.drone.fileprovider.transfer_inputfiles( + if self.drone.connection and self.used_inputfiles: + self._streamtime = await self.drone.connection.transfer_inputfiles( self.drone, self.requested_inputfiles, repr(self) ) print( "streamed inputfiles {} for job {} in {} timeunits, finished @ {}".format( - self.requested_inputfiles.keys(), repr(self), self._streamtime, time.now + self.requested_inputfiles.keys(), + repr(self), + self._streamtime, + time.now, ) ) @@ -129,13 +135,9 @@ async def run(self): ) ) try: -<<<<<<< HEAD - self.recalculate_walltime() - await (time + self._walltime) -======= + await self.transfer_inputfiles() - await (time + self.calculation_time()) ->>>>>>> split processing of job into file transfer and actual calculation + await (time + self.calculation_time) except CancelTask: self.drone = None self._success = False diff --git a/lapis/simulator.py b/lapis/simulator.py index dedb2bb..f216a38 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,7 +6,7 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from lapis.file_provider import FileProvider +from lapis.connection import Connection from lapis.monitor.general import ( user_demand, job_statistics, @@ -30,7 +30,7 @@ def __init__(self, seed=1234): random.seed(seed) self.job_queue = Queue() self.pools = [] - self.fileprovider: FileProvider + self.connection: Connection self.controllers = [] self.job_scheduler = None self.job_generator = None @@ -59,32 +59,32 @@ def create_job_generator(self, job_input, job_reader): def create_pools(self, pool_input, pool_reader, pool_type, controller=None): assert self.job_scheduler, "Scheduler needs to be created before pools" - assert self.fileprovider, ( + assert self.connection, ( "Connection module needs to be created before " "storages" ) for pool in pool_reader( iterable=pool_input, pool_type=pool_type, - make_drone=partial(Drone, self.job_scheduler, 
self.fileprovider), + make_drone=partial(Drone, self.job_scheduler, self.connection), ): self.pools.append(pool) if controller: self.controllers.append(controller(target=pool, rate=1)) def create_storage(self, storage_input, storage_content_input, storage_reader): - assert self.fileprovider, ( + assert self.connection, ( "Connection module needs to be created before " "storages" ) for storage in storage_reader( storage=storage_input, storage_content=storage_content_input ): - self.fileprovider.add_storage_element(storage) + self.connection.add_storage_element(storage) def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) def create_connection_module(self, remote_throughput, cache_hitrate=None): - self.fileprovider = FileProvider(remote_throughput, cache_hitrate) + self.connection = Connection(remote_throughput, cache_hitrate) def run(self, until=None): print(f"running until {until}") diff --git a/lapis/storage.py b/lapis/storage.py index 06770fe..72cb98c 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -6,6 +6,15 @@ from lapis.cachealgorithm import CacheAlgorithm +class Storage: + pass + + +class LookUpInformation(NamedTuple): + cached_filesize: int + storage: Storage + + class Storage(object): __slots__ = ( @@ -78,14 +87,13 @@ async def remove_from_storage(self, file: StoredFile, job_repr): ) await (time + self.deletion_duration) await self._usedstorage.decrease(usedsize=file.filesize) - # self.filenames.remove(file.filename) self.files.pop(file.filename) async def add_to_storage(self, file: RequestedFile, job_repr): """ Adds file to storage object transfering it through the storage objects connection. This should be sufficient for now because files are only added - to the storage when they are also transfered through the FileProviders remote + to the storage when they are also transfered through the Connections remote connection. If this simulator is extended to include any kind of direct file placement this has to be adapted. :param file: @@ -98,10 +106,9 @@ async def add_to_storage(self, file: RequestedFile, job_repr): ) ) file = file.convert_to_stored_file_object(time.now) - await self.connection.transfer(file.filesize) await self._usedstorage.increase(usedsize=file.filesize) - # self.filenames.add(file.filename) self.files[file.filename] = file + await self.connection.transfer(file.filesize) async def update_file(self, stored_file: StoredFile, job_repr): """ @@ -165,10 +172,7 @@ async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_repr): """ Searches storage object for the requested_file and sends result (amount of - cached data, storage object) to queue if queue was passed as parameter. - If no queue was passed the result is returned normally. This might be needed - when looking up files during coordination and is to be removed if it's not - necessary. + cached data, storage object) to the queue. 
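# Illustrative sketch (not part of the patch series, names are simplified
# stand-ins): the look-up result described above, reduced to plain Python.
# A storage reports how many bytes of a requested file it already caches;
# 0 signals a miss, in which case the remote connection is used instead.
from typing import Any, NamedTuple


class LookUpInformation(NamedTuple):
    cached_filesize: int
    storage: Any


def look_up(files: dict, filename: str, storage: Any) -> LookUpInformation:
    stored = files.get(filename)
    return LookUpInformation(stored["filesize"] if stored else 0, storage)


assert look_up({"a.root": {"filesize": 5}}, "a.root", "cache").cached_filesize == 5
assert look_up({}, "b.root", "cache").cached_filesize == 0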
:param requested_file: :param queue: :param job_repr: Needed for debug output, will be replaced @@ -180,10 +184,6 @@ async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_re ) ) - class LookUpInformation(NamedTuple): - cached_filesize: int - storage: Storage - try: await queue.put( LookUpInformation( From 6b6422df2c15da05623254852e2da17ec30e5d26 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 20 Nov 2019 17:54:37 +0100 Subject: [PATCH 387/648] Update lapis/storage.py --- lapis/storage.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 72cb98c..3ca78bc 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -6,13 +6,9 @@ from lapis.cachealgorithm import CacheAlgorithm -class Storage: - pass - - class LookUpInformation(NamedTuple): cached_filesize: int - storage: Storage + storage: "Storage" class Storage(object): From ee7c20d28ba68d20f7c13ffe15facd4a277d184b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 26 Nov 2019 12:51:45 +0100 Subject: [PATCH 388/648] updated usim version requirement to 0.4.2 --- pyproject.toml | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3217107..675c1b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,17 +12,22 @@ description-file = "README.rst" keywords = "htcondor simulation python cobald tardis opportunistic scheduling scheduler" classifiers = [ "License :: OSI Approved :: MIT License", - "Development Status :: 2 - Pre-Alpha", - "Intended Audience :: Developers", - "Intended Audience :: Information Technology", - "Intended Audience :: Science/Research", - "Intended Audience :: System Administrators", - "Topic :: Adaptive Technologies", - "Topic :: Office/Business :: Scheduling", - "Topic :: System :: Distributed Computing", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Information Technology', + 'Intended Audience :: Science/Research', + 'Intended Audience :: System Administrators', + 'Topic :: Adaptive Technologies', + 'Topic :: Office/Business :: Scheduling', + 'Topic :: System :: Distributed Computing', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7' +] +requires = [ + "cobald", + "usim == 0.4.2", + "click", ] requires = ["cobald", "usim == 0.4", "click"] From 7dc67c47f78d07d926a32225b8e8fcda89ea302d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 26 Nov 2019 14:21:02 +0100 Subject: [PATCH 389/648] completed renaming of file provider to connection --- lapis_tests/test_job.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 208581c..4fd96ae 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -4,8 +4,8 @@ from lapis.drone import Drone from lapis.job import Job -from lapis.file_provider import FileProvider -from lapis_tests import via_usim, DummyScheduler +from lapis_tests import via_usim, DummyScheduler, DummyDrone +from lapis.connection import Connection class TestJob(object): @@ -49,7 +49,7 @@ async def test_job_in_drone(self): scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, - fileprovider=FileProvider(), + connection=Connection(), ) await drone.run() async with Scope() 
as scope: From efe2d88780016655313799920131be6dd03c0189 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 27 Nov 2019 10:24:41 +0100 Subject: [PATCH 390/648] fixed job and simulator unit tests --- lapis_tests/__init__.py | 2 +- lapis_tests/test_simulator.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 722b3a2..00c6830 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -46,4 +46,4 @@ def update_drone(drone: Drone): class DummyDrone: - pass + connection = None diff --git a/lapis_tests/test_simulator.py b/lapis_tests/test_simulator.py index 4875d6b..0d51d42 100644 --- a/lapis_tests/test_simulator.py +++ b/lapis_tests/test_simulator.py @@ -31,6 +31,7 @@ def test_simulation_exit(self): job_input=job_input, job_reader=htcondor_job_reader ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.create_connection_module() simulator.create_pools( pool_input=machine_input, pool_reader=htcondor_pool_reader, From b531f562b6bcb2d64aa82bcb1e92732d59d5e1e1 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 27 Nov 2019 10:28:55 +0100 Subject: [PATCH 391/648] fixed job and simulator unit tests --- lapis/job.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 331bc4e..2d0b01e 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -5,7 +5,6 @@ from usim import CancelTask from lapis.monitor import sampling_required -from lapis.utilities.walltime_models import walltime_models if TYPE_CHECKING: from lapis.drone import Drone @@ -63,7 +62,7 @@ def __init__( ) self.resources[key] = self.used_resources[key] self._walltime = used_resources.pop("walltime") - self._calculationtime = walltime_models["maxeff"](self, self._walltime) + self._calculationtime = self.get_calculation_time() self._streamtime = 0 self.requested_walltime = resources.pop("walltime", None) self.requested_inputfiles = resources.pop("inputfiles", None) @@ -108,6 +107,21 @@ def calculation_time(self): print("WALLTIME: Job {} @ {}".format(repr(self), time.now)) return self._calculationtime + def get_calculation_time(self, calculation_efficiency=0.9): + """ + Determines a jobs calculation time based on the jobs CPU time and a + calculation efficiency representing inefficient programming. 
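# Illustrative sketch (not part of the patch series): the efficiency
# correction introduced in the hunk below, written out standalone.  CPU time
# is stretched by used cores over an assumed calculation efficiency, falling
# back to the raw walltime when no core usage is known; the walltime observed
# by the simulation is this calculation time plus the input-file stream time.
def calculation_time(used_resources: dict, walltime: float,
                     calculation_efficiency: float = 0.9) -> float:
    try:
        return (used_resources["cores"] / calculation_efficiency) * walltime
    except KeyError:
        return walltime


def observed_walltime(streamtime: float, calculationtime: float) -> float:
    return streamtime + calculationtime


assert calculation_time({"cores": 0.9}, 100.0) == 100.0   # 0.9 cores at 90 % efficiency
assert calculation_time({}, 100.0) == 100.0               # fallback: no core information
assert observed_walltime(20.0, 100.0) == 120.0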
+ :param calculation_efficiency: + :return: + """ + try: + return ( + self.used_resources["cores"] / calculation_efficiency + ) * self._walltime + except KeyError: + # logging.getLogger("implementation").info() + return self._walltime + async def transfer_inputfiles(self): print("TRANSFERING INPUTFILES: Job {} @ {}".format(repr(self), time.now)) if self.drone.connection and self.used_inputfiles: @@ -124,20 +138,22 @@ async def transfer_inputfiles(self): ) ) - async def run(self): + async def run(self, drone: "Drone"): + assert drone, "Jobs cannot run without a drone being assigned" + self.drone = drone self.in_queue_until = time.now self._success = None await sampling_required.put(self) - if self.drone: - print( - "running job {} on site {} in drone {}".format( - repr(self), self.drone.sitename, repr(self.drone) - ) - ) + print("running job {} in drone {}".format(repr(self), repr(self.drone))) try: - - await self.transfer_inputfiles() - await (time + self.calculation_time) + if self.requested_inputfiles: + await self.transfer_inputfiles() + await (time + self.calculation_time) + else: + # ToDo: improve handling of jobs without inputfiles (correct value in + # self.walltime and therefore in monitoring etc) + await (time + self._walltime) + print(self.calculation_time, self._streamtime, self.walltime) except CancelTask: self.drone = None self._success = False From 980ac971c3b4e19b3bea9ca2103d3bd52d8edc94 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 27 Nov 2019 10:30:07 +0100 Subject: [PATCH 392/648] replaced Storage.__repr__ to match the other classes --- lapis/storage.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 3ca78bc..ebc9678 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -196,11 +196,3 @@ async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_re def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self.name or id(self)) - - # return "{name} on site {site}: {used}MB of {tot}MB used ({div} %)".format( - # name=self.name, - # site=self.sitename, - # used=self.usedstorage, - # tot=self.storagesize, - # div=100.0 * self.usedstorage / self.storagesize, - # ) From f9599026efac52c4d51fba1951146660bbb648f2 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 27 Nov 2019 10:34:44 +0100 Subject: [PATCH 393/648] added missing default values for unit test compatibility --- lapis/simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index f216a38..3b12748 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -83,7 +83,7 @@ def create_storage(self, storage_input, storage_content_input, storage_reader): def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) - def create_connection_module(self, remote_throughput, cache_hitrate=None): + def create_connection_module(self, remote_throughput=100, cache_hitrate=None): self.connection = Connection(remote_throughput, cache_hitrate) def run(self, until=None): From bf3a68bcb1a159011c4c18c68fd19ff6b45528d9 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 27 Nov 2019 10:35:25 +0100 Subject: [PATCH 394/648] extended monitoring --- lapis/connection.py | 1 + lapis/monitor/general.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 7adc635..c548190 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -165,6 +165,7 @@ async def transfer_wrapper(self, 
connection, total): total, connection.throughput, time.now ) ) + await sampling_required.put(connection) await connection.transfer(total=total) print( "transfering {} with {}, stop @ {}".format( diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index eb8b8a0..51064ee 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -243,7 +243,7 @@ def storage_status(storage: Storage) -> list: LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage_status"}, resolution=1 + tags={"tardis", "storage"}, resolution=1 ), } @@ -271,7 +271,7 @@ def storage_connection(storage: Storage) -> list: LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage_connection"}, resolution=1 + tags={"tardis", "storage"}, resolution=1 ), } @@ -298,6 +298,6 @@ def remote_connection(remote: Pipe) -> list: LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage_connection"}, resolution=1 + tags={"tardis"}, resolution=1 ), } From 205818b30dbf50cb6b9460adeae3a25bb59bc603 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 27 Nov 2019 10:41:59 +0100 Subject: [PATCH 395/648] fixed PEP8 issue --- lapis/job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index 2d0b01e..b58bf46 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -130,7 +130,8 @@ async def transfer_inputfiles(self): ) print( - "streamed inputfiles {} for job {} in {} timeunits, finished @ {}".format( + "streamed inputfiles {} for job {} in {} timeunits, finished @ {}" + "".format( self.requested_inputfiles.keys(), repr(self), self._streamtime, From b169f22994d9fcb6437c112894b601d380d2026f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 16:33:28 +0100 Subject: [PATCH 396/648] updated via_usim decorator --- lapis_tests/__init__.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 00c6830..1194ae4 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -6,6 +6,16 @@ from lapis.drone import Drone +class UnfinishedTest(RuntimeError): + """A test did never finish""" + + def __init__(self, test_case): + self.test_case = test_case + super().__init__( + "Test case %r did not finish" % getattr(test_case, "__name__", test_case) + ) + + def via_usim(test_case: Callable[..., Coroutine]): """ Mark an ``async def`` test case to be run via ``usim.run`` @@ -22,11 +32,16 @@ async def test_sleep(): @wraps(test_case) def run_test(*args, **kwargs): - # pytest currently ignores __tracebackhide__ if we re-raise - # https://github.com/pytest-dev/pytest/issues/1904 - __tracebackhide__ = True - # >>> This is not the frame you are looking for. Do read on. 
<<< - return run(test_case(*args, **kwargs)) + test_completed = False + + async def complete_test_case(): + nonlocal test_completed + await test_case(*args, **kwargs) + test_completed = True + + run(complete_test_case()) + if not test_completed: + raise UnfinishedTest(test_case) return run_test From 77a0542f3f6d3917239d35cd3a787a9b4ee24ffc Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 17:22:29 +0100 Subject: [PATCH 397/648] added statistics about jobs in DummyScheduler --- lapis_tests/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 1194ae4..f749a6b 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -1,9 +1,10 @@ from typing import Callable, Coroutine from functools import wraps -from usim import run +from usim import run, Resources from lapis.drone import Drone +from lapis.job import Job class UnfinishedTest(RuntimeError): @@ -47,6 +48,9 @@ async def complete_test_case(): class DummyScheduler: + def __init__(self): + self.statistics = Resources(job_succeeded=0, job_failed=0) + @staticmethod def register_drone(drone: Drone): pass @@ -59,6 +63,12 @@ def unregister_drone(drone: Drone): def update_drone(drone: Drone): pass + async def job_finished(self, job: Job): + if job.successful: + await self.statistics.increase(job_succeeded=1) + else: + await self.statistics.increase(job_failed=1) + class DummyDrone: connection = None From 30e4753f2aca64fd9fca6548bbd84cbbfe271d40 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 17:23:10 +0100 Subject: [PATCH 398/648] made unit tests succeed again --- lapis_tests/test_job.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 4fd96ae..29c8930 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -51,9 +51,13 @@ async def test_job_in_drone(self): scheduling_duration=0, connection=Connection(), ) - await drone.run() async with Scope() as scope: + scope.do(drone.run(), volatile=True) scope.do(drone.schedule_job(job=job)) + await ( + scheduler.statistics._available + == scheduler.statistics.resource_type(job_succeeded=1) + ) assert 10 == time.now assert 0 == job.waiting_time assert job.successful @@ -70,9 +74,13 @@ async def test_nonmatching_job_in_drone(self): pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, ) - await drone.run() async with Scope() as scope: + scope.do(drone.run(), volatile=True) scope.do(drone.schedule_job(job=job)) + await ( + scheduler.statistics._available + == scheduler.statistics.resource_type(job_failed=1) + ) assert 0 == time assert not job.successful assert 0 == job.waiting_time @@ -93,10 +101,14 @@ async def test_two_nonmatching_jobs(self): pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, ) - await drone.run() async with Scope() as scope: + scope.do(drone.run(), volatile=True) scope.do(drone.schedule_job(job=job_one)) scope.do(drone.schedule_job(job=job_two)) + await ( + scheduler.statistics._available + == scheduler.statistics.resource_type(job_succeeded=1, job_failed=1) + ) assert 10 == time assert job_one.successful assert not job_two.successful @@ -119,10 +131,14 @@ async def test_two_matching_jobs(self): pool_resources={"cores": 2, "memory": 2}, scheduling_duration=0, ) - await drone.run() async with Scope() as scope: + scope.do(drone.run(), volatile=True) scope.do(drone.schedule_job(job=job_one)) 
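# Illustrative sketch (not part of the patch series): the bookkeeping the
# DummyScheduler above performs with usim.Resources, reduced to plain
# counters.  Tests can then wait until the expected totals of succeeded and
# failed jobs are reached.
class CountingScheduler:
    def __init__(self) -> None:
        self.job_succeeded = 0
        self.job_failed = 0

    def job_finished(self, successful: bool) -> None:
        if successful:
            self.job_succeeded += 1
        else:
            self.job_failed += 1


scheduler = CountingScheduler()
scheduler.job_finished(True)
scheduler.job_finished(False)
assert (scheduler.job_succeeded, scheduler.job_failed) == (1, 1)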
scope.do(drone.schedule_job(job=job_two)) + await ( + scheduler.statistics._available + == scheduler.statistics.resource_type(job_succeeded=2) + ) assert 10 == time assert job_one.successful assert job_two.successful From ee7ff50bb99b7d08c21c34b4374f92e70e73c24b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 17:24:22 +0100 Subject: [PATCH 399/648] made enabling of monitoring explicit --- lapis/cli/simulate.py | 3 +++ lapis/simulator.py | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 6057c49..9360d72 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -109,6 +109,7 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit storage_content_input=storage_content_file, storage_reader=storage_import_mapper[storage_type], ) + simulator.enable_monitoring() simulator.run(until=ctx.obj["until"]) @@ -141,6 +142,7 @@ def dynamic(ctx, job_file, pool_file): pool_type=Pool, controller=SimulatedLinearController, ) + simulator.enable_monitoring() simulator.run(until=ctx.obj["until"]) @@ -186,6 +188,7 @@ def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): pool_type=Pool, controller=SimulatedLinearController, ) + simulator.enable_monitoring() simulator.run(until=ctx.obj["until"]) diff --git a/lapis/simulator.py b/lapis/simulator.py index 3b12748..7521c75 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -36,12 +36,10 @@ def __init__(self, seed=1234): self.job_generator = None self.cost = 0 self._job_generators = [] - self.monitoring = None + self.monitoring = Monitoring() self.duration = None - self.enable_monitoring() def enable_monitoring(self): - self.monitoring = Monitoring() self.monitoring.register_statistic(user_demand) self.monitoring.register_statistic(job_statistics) self.monitoring.register_statistic(job_events) From 3e787e207519749ecd709e0e3aa89569cae37d00 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 17:31:18 +0100 Subject: [PATCH 400/648] blackened file --- lapis/utilities/walltime_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis/utilities/walltime_models.py b/lapis/utilities/walltime_models.py index 7484b6a..1a68ed9 100644 --- a/lapis/utilities/walltime_models.py +++ b/lapis/utilities/walltime_models.py @@ -1,3 +1,4 @@ +# walltime models for caching def extrapolate_walltime_to_maximal_efficiency( From 04a6f30bda285aabc89baa008437787cd8808fcd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 18:34:00 +0100 Subject: [PATCH 401/648] changed cli to also start without any caching information --- lapis/cli/simulate.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 9360d72..aebb530 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -82,6 +82,7 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): click.File("r"), click.Choice(list(storage_import_mapper.keys())), ), + default=(None, None, None), ) @click.option("--remote-throughput", "remote_throughput", type=float, default=10) @click.option("--cache-hitrate", "cache_hitrate", type=float, default=None) @@ -94,8 +95,15 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit job_input=file, job_reader=job_import_mapper[file_type] ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) - simulator.create_connection_module(remote_throughput, cache_hitrate) + if all(storage_files): + 
simulator.create_connection_module(remote_throughput, cache_hitrate) + storage_file, storage_content_file, storage_type = storage_files + simulator.create_storage( + storage_input=storage_file, + storage_content_input=storage_content_file, + storage_reader=storage_import_mapper[storage_type], + ) for current_pool in pool_file: pool_file, pool_file_type = current_pool simulator.create_pools( @@ -103,12 +111,6 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit pool_reader=pool_import_mapper[pool_file_type], pool_type=StaticPool, ) - storage_file, storage_content_file, storage_type = storage_files - simulator.create_storage( - storage_input=storage_file, - storage_content_input=storage_content_file, - storage_reader=storage_import_mapper[storage_type], - ) simulator.enable_monitoring() simulator.run(until=ctx.obj["until"]) From 13e9e94b0bcc01a4a99526388bc3f05b6b339e33 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 18:35:05 +0100 Subject: [PATCH 402/648] changed assignment of connections a bit --- lapis/drone.py | 7 +++++-- lapis/pool.py | 18 +++++++++++++++--- lapis/pool_io/htcondor.py | 4 ++++ lapis/simulator.py | 14 +++++--------- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index ac5160d..7217b32 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -15,11 +15,11 @@ class Drone(interfaces.Pool): def __init__( self, scheduler, - connection: Connection = Connection(), pool_resources: Optional[dict] = None, scheduling_duration: Optional[float] = None, ignore_resources: list = None, sitename: str = None, + connection: Connection = None, ): """ :param scheduler: @@ -28,7 +28,10 @@ def __init__( """ super(Drone, self).__init__() self.scheduler = scheduler - self.connection = connection + if connection is not None: + self.connection = connection + else: + self.connection = Connection() self.sitename = sitename self.pool_resources = pool_resources self.resources = Capacities(**pool_resources) diff --git a/lapis/pool.py b/lapis/pool.py index 2a3e465..74bbc9b 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -1,7 +1,9 @@ +from functools import partial from typing import Generator, Callable from cobald import interfaces from usim import eternity, Scope, interval +from lapis.connection import Connection from .drone import Drone @@ -24,15 +26,20 @@ def __init__( capacity: int = float("inf"), init: int = 0, name: str = None, + connection: Connection = None, ): super(Pool, self).__init__() assert init <= capacity - self.make_drone = make_drone self._drones = [] self._demand = 1 self._level = init self._capacity = capacity self._name = name + # TODO: Should drones have access to the pool or the connection directly? + if connection is not None: + self.make_drone = partial(make_drone, connection=connection) + else: + self.make_drone = make_drone async def init_pool(self, scope: Scope, init: int = 0): """ @@ -136,10 +143,15 @@ class StaticPool(Pool): instantiated within the pool """ - def __init__(self, make_drone: Callable, capacity: int = 0): + def __init__( + self, make_drone: Callable, capacity: int = 0, connection: Connection = None + ): assert capacity > 0, "Static pool was initialised without any resources..." 
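# Illustrative sketch (not part of the patch series, stand-in types only):
# how the optional connection is threaded into drone creation above -- when a
# pool is given a Connection, it is bound into the drone factory with
# functools.partial so every drone created by that pool shares it.
from functools import partial


def make_drone(scheduler: str, connection: object = None) -> dict:
    return {"scheduler": scheduler, "connection": connection}


def drone_factory(make_drone, connection: object = None):
    if connection is not None:
        return partial(make_drone, connection=connection)
    return make_drone


shared_connection = object()  # stands in for a lapis Connection
factory = drone_factory(make_drone, shared_connection)
assert factory("scheduler")["connection"] is shared_connection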
super(StaticPool, self).__init__( - capacity=capacity, init=capacity, make_drone=make_drone + capacity=capacity, + init=capacity, + make_drone=make_drone, + connection=connection, ) self._demand = capacity diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 84fd948..d60e92e 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -2,6 +2,8 @@ from functools import partial from typing import Callable + +from lapis.connection import Connection from ..pool import Pool @@ -19,6 +21,7 @@ def htcondor_pool_reader( }, pool_type: Callable = Pool, make_drone: Callable = None, + connection: Connection = None, ): """ Load a pool configuration that was exported via htcondor from files or @@ -50,4 +53,5 @@ def htcondor_pool_reader( ignore_resources=["disk"], sitename=row.get("sitename", None), ), + connection=connection, ) diff --git a/lapis/simulator.py b/lapis/simulator.py index 7521c75..18951f9 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -30,7 +30,7 @@ def __init__(self, seed=1234): random.seed(seed) self.job_queue = Queue() self.pools = [] - self.connection: Connection + self.connection: Connection = None self.controllers = [] self.job_scheduler = None self.job_generator = None @@ -57,22 +57,18 @@ def create_job_generator(self, job_input, job_reader): def create_pools(self, pool_input, pool_reader, pool_type, controller=None): assert self.job_scheduler, "Scheduler needs to be created before pools" - assert self.connection, ( - "Connection module needs to be created before " "storages" - ) for pool in pool_reader( iterable=pool_input, pool_type=pool_type, - make_drone=partial(Drone, self.job_scheduler, self.connection), + make_drone=partial(Drone, self.job_scheduler), + connection=self.connection, ): self.pools.append(pool) if controller: self.controllers.append(controller(target=pool, rate=1)) def create_storage(self, storage_input, storage_content_input, storage_reader): - assert self.connection, ( - "Connection module needs to be created before " "storages" - ) + assert self.connection, "Connection module needs to be created before storages" for storage in storage_reader( storage=storage_input, storage_content=storage_content_input ): @@ -81,7 +77,7 @@ def create_storage(self, storage_input, storage_content_input, storage_reader): def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) - def create_connection_module(self, remote_throughput=100, cache_hitrate=None): + def create_connection_module(self, remote_throughput, cache_hitrate): self.connection = Connection(remote_throughput, cache_hitrate) def run(self, until=None): From e1deb6caaf4fe8df7201f86dd1e74a6bccae7b79 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 18:36:11 +0100 Subject: [PATCH 403/648] removed creation of connection module from test as it is not required --- lapis_tests/test_simulator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis_tests/test_simulator.py b/lapis_tests/test_simulator.py index 0d51d42..4875d6b 100644 --- a/lapis_tests/test_simulator.py +++ b/lapis_tests/test_simulator.py @@ -31,7 +31,6 @@ def test_simulation_exit(self): job_input=job_input, job_reader=htcondor_job_reader ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) - simulator.create_connection_module() simulator.create_pools( pool_input=machine_input, pool_reader=htcondor_pool_reader, From e5a54c78b490548ff3dd9e10d21b4efd8de1ffce Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 20:14:29 +0100 
Subject: [PATCH 404/648] converted storage and file sizes to bytes --- lapis/storage_io/storage.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 5be532e..19a8bd3 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -2,20 +2,35 @@ from lapis.storage import Storage -def storage_reader(storage, storage_content): +def storage_reader( + storage, + storage_content, + unit_conversion_mapping: dict = { + "cachesizeGB": 1024 * 1024 * 1024, + "throughput_limit": 1024 * 1024 * 1024, + }, # noqa: B006 +): storage_content = storage_content_reader(storage_content) reader = csv.DictReader(storage, delimiter=" ", quotechar="'") for row in reader: yield Storage( name=row["name"], sitename=row["sitename"], - storagesize=int(row["cachesizeGB"]), + storagesize=int( + row["cachesizeGB"] * unit_conversion_mapping.get("cachesizeGB", 1) + ), throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], ) -def storage_content_reader(file_name): +def storage_content_reader( + file_name, + unit_conversion_mapping: dict = { + "filesize": 1024 * 1024 * 1024, + "usedsize": 1024 * 1024 * 1024, + }, +): reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") cache_information = dict() for row in reader: @@ -29,7 +44,9 @@ def storage_content_reader(file_name): "lastaccessed", "numberofaccesses", ]: - cache_information[row["cachename"]][row["filename"]][key] = int(row[key]) + cache_information[row["cachename"]][row["filename"]][key] = int( + row[key] * unit_conversion_mapping.get(key, 1) + ) if not cache_information: cache_information = None return cache_information From ccdebfcefb6b02e5aaddec885bce67dbf7b7dbf6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 20:30:18 +0100 Subject: [PATCH 405/648] corrected access to numberofaccesses --- lapis/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/files.py b/lapis/files.py index 9158674..0469c4a 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -11,7 +11,7 @@ def __init__(self, filename, filespecs): self.numberofaccesses: int = filespecs.get("numberofaccesses", 0) def increment_accesses(self): - self._numberofaccesses += 1 + self.numberofaccesses += 1 class RequestedFile(NamedTuple): From 19c23684066181b12fa9ee702a9bf6a11939bf8c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 21:26:37 +0100 Subject: [PATCH 406/648] changed signature of StoredFile and adapted in IO operations --- lapis/files.py | 35 +++++++++++++++++++++++++++-------- lapis/storage.py | 5 +---- lapis/storage_io/storage.py | 22 ++++++++-------------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/lapis/files.py b/lapis/files.py index 0469c4a..b7871aa 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -2,13 +2,32 @@ class StoredFile(object): - def __init__(self, filename, filespecs): + + __slots__ = ( + "filename", + "filesize", + "storedsize", + "cachedsince", + "lastaccessed", + "numberofaccesses", + ) + + def __init__( + self, + filename: str, + filesize: Optional[int] = None, + storedsize: Optional[int] = None, + cachedsince: Optional[int] = None, + lastaccessed: Optional[int] = None, + numberofaccesses: Optional[int] = None, + **filespecs, + ): self.filename = filename - self.filesize: Optional[int] = filespecs.get("filesize", None) - self.storedsize: Optional[int] = filespecs.get("storedsize", self.filesize) - self.cachedsince: Optional[int] = 
filespecs.get("cachedsince", None) - self.lastaccessed: Optional[int] = filespecs.get("lastaccessed", None) - self.numberofaccesses: int = filespecs.get("numberofaccesses", 0) + self.filesize = filesize + self.storedsize = storedsize or self.filesize + self.cachedsince = cachedsince + self.lastaccessed = lastaccessed + self.numberofaccesses = numberofaccesses def increment_accesses(self): self.numberofaccesses += 1 @@ -20,10 +39,10 @@ class RequestedFile(NamedTuple): def convert_to_stored_file_object(self, currenttime): print(self.filesize) - filespecs = dict( + return StoredFile( + self.filename, filesize=self.filesize, cachedsince=currenttime, lastaccessed=currenttime, numberofaccesses=1, ) - return StoredFile(self.filename, filespecs) diff --git a/lapis/storage.py b/lapis/storage.py index ebc9678..691bcbd 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -39,10 +39,7 @@ def __init__( self.deletion_duration = 5 self.update_duration = 1 self.storagesize = storagesize - self.files = { - filename: StoredFile(filename, filespecs) - for filename, filespecs in files.items() - } + self.files = files self._usedstorage = Resources( usedsize=sum(file.filesize for file in list(self.files.values())) ) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 19a8bd3..b59ef37 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -1,4 +1,6 @@ import csv + +from lapis.files import StoredFile from lapis.storage import Storage @@ -34,19 +36,11 @@ def storage_content_reader( reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") cache_information = dict() for row in reader: - if row["cachename"] not in cache_information.keys(): - cache_information[row["cachename"]] = dict() - cache_information[row["cachename"]][row["filename"]] = dict() - for key in [ - "filesize", - "usedsize", - "cachedsince", - "lastaccessed", - "numberofaccesses", - ]: - cache_information[row["cachename"]][row["filename"]][key] = int( - row[key] * unit_conversion_mapping.get(key, 1) - ) + for key in row: + row[key] = row[key] * unit_conversion_mapping.get(key, 1) + cache_information.setdefault(row["cachename"], {})[ + row["filename"] + ] = StoredFile(row["filename"], **row) if not cache_information: - cache_information = None + return None return cache_information From 0cf879a148c923d055a421f52936732603311f0c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 21:27:58 +0100 Subject: [PATCH 407/648] improved storage --- lapis/storage.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 691bcbd..6bc6ecf 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -28,8 +28,8 @@ class Storage(object): def __init__( self, - name: str = None, - sitename: str = None, + name: Optional[str] = None, + sitename: Optional[str] = None, storagesize: int = 1000, throughput_limit: int = 10, files: Optional[dict] = None, @@ -41,7 +41,7 @@ def __init__( self.storagesize = storagesize self.files = files self._usedstorage = Resources( - usedsize=sum(file.filesize for file in list(self.files.values())) + size=sum(file.filesize for file in files.values()) ) self.cachealgorithm = CacheAlgorithm(self) self.connection = Pipe(throughput_limit) @@ -49,7 +49,7 @@ def __init__( @property def usedstorage(self): - return self._usedstorage.levels.usedsize + return self._usedstorage.levels.size def free_space(self): return self.storagesize - self.usedstorage @@ -60,10 +60,7 @@ def find_file(self, filename: str) -> StoredFile: 
:param filename: :return: corresponding file object """ - try: - return self.files[filename] - except KeyError: - raise KeyError + return self.files[filename] async def remove_from_storage(self, file: StoredFile, job_repr): """ From 560d5512c981e555f410bd419c0e9c15fd0859af Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:12:20 +0100 Subject: [PATCH 408/648] minimum required usim version set to 0.4.3 --- pyproject.toml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 675c1b7..6f34b1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,12 +24,8 @@ classifiers = [ 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7' ] -requires = [ - "cobald", - "usim == 0.4.2", - "click", -] -requires = ["cobald", "usim == 0.4", "click"] + +requires = ["cobald", "usim >= 0.4.3", "click"] [tool.flit.metadata.requires-extra] test = [ From 4955813dc480d9070240e9caf07ee2d121dc8652 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:15:38 +0100 Subject: [PATCH 409/648] renamed remove_from_storage and add_to_storage to remove and add --- lapis/storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 6bc6ecf..8bc8efb 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -62,7 +62,7 @@ def find_file(self, filename: str) -> StoredFile: """ return self.files[filename] - async def remove_from_storage(self, file: StoredFile, job_repr): + async def remove(self, file: StoredFile, job_repr): """ Deletes file from storage object. The time this operation takes is defined by the storages deletion_duration attribute. @@ -79,7 +79,7 @@ async def remove_from_storage(self, file: StoredFile, job_repr): await self._usedstorage.decrease(usedsize=file.filesize) self.files.pop(file.filename) - async def add_to_storage(self, file: RequestedFile, job_repr): + async def add(self, file: RequestedFile, job_repr): """ Adds file to storage object transfering it through the storage objects connection. 
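# Illustrative sketch (not part of the patch series): the storage accounting
# behind add(), remove() and the free_space property in the surrounding hunks,
# reduced to plain Python -- adding a file grows the used size, removing it
# shrinks it, and free space is capacity minus usage.
class TinyCache:
    def __init__(self, storagesize: int) -> None:
        self.storagesize = storagesize
        self.files: dict = {}
        self.usedstorage = 0

    @property
    def free_space(self) -> int:
        return self.storagesize - self.usedstorage

    def add(self, filename: str, filesize: int) -> None:
        self.files[filename] = filesize
        self.usedstorage += filesize

    def remove(self, filename: str) -> None:
        self.usedstorage -= self.files.pop(filename)


cache = TinyCache(storagesize=100)
cache.add("a.root", 40)
assert cache.free_space == 60
cache.remove("a.root")
assert cache.free_space == 100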
This should be sufficient for now because files are only added @@ -147,7 +147,7 @@ async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): ) to_be_removed = self.cachealgorithm.consider(requested_file) if not to_be_removed: - await self.add_to_storage(requested_file, job_repr) + await self.add(requested_file, job_repr) elif to_be_removed == {requested_file}: # file will not be cached because it either does not match # conditions or because there is no space in the cache @@ -157,7 +157,7 @@ async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): ) else: for file in to_be_removed: - await self.remove_from_storage(file, job_repr) + await self.remove(file, job_repr) async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_repr): """ From bb0a99b3a11e67fe303400d043742c9c83183359 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:17:20 +0100 Subject: [PATCH 410/648] made free_space a property of storage --- lapis/cachealgorithm.py | 4 ++-- lapis/storage.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py index 6f5048b..0f4bb34 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/cachealgorithm.py @@ -37,7 +37,7 @@ def _context_based_consideration(self, candidate: RequestedFile): """ to_be_removed = set() sorted_stored_files = sort_files_by_cachedsince(self._storage.files) - current_free_storage = self._storage.free_space() + current_free_storage = self._storage.free_space for stored_file in sorted_stored_files: if stored_file.numberofaccesses < 3: to_be_removed.add(stored_file) @@ -59,7 +59,7 @@ def consider(self, candidate: RequestedFile) -> Optional[Set[RequestedFile]]: :return: """ if self._file_based_consideration(candidate): - if self._storage.free_space() < candidate.filesize: + if self._storage.free_space < candidate.filesize: return self._context_based_consideration(candidate) else: return set() diff --git a/lapis/storage.py b/lapis/storage.py index 8bc8efb..dd9b39e 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -51,6 +51,7 @@ def __init__( def usedstorage(self): return self._usedstorage.levels.size + @property def free_space(self): return self.storagesize - self.usedstorage From bd15741e4a4a391bea57a792a080a637fd1b50fd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:23:14 +0100 Subject: [PATCH 411/648] removed method find_file from storage --- lapis/storage.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index dd9b39e..03cb2a3 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -55,14 +55,6 @@ def usedstorage(self): def free_space(self): return self.storagesize - self.usedstorage - def find_file(self, filename: str) -> StoredFile: - """ - Searches storage object for file with passed filename - :param filename: - :return: corresponding file object - """ - return self.files[filename] - async def remove(self, file: StoredFile, job_repr): """ Deletes file from storage object. 
The time this operation takes is defined @@ -128,7 +120,7 @@ async def transfer(self, file, job_repr): """ await self.connection.transfer(file.filesize) try: - await self.update_file(self.find_file(file.filename), job_repr) + await self.update_file(self.files[file.filename], job_repr) except AttributeError: pass @@ -177,9 +169,7 @@ async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_re try: await queue.put( - LookUpInformation( - self.find_file(requested_file.filename).filesize, self - ) + LookUpInformation(self.files[requested_file.filename].filesize, self) ) except KeyError: print( From 26e5b5241cebb7ccf91995f9ca3a57f21494b45e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:24:22 +0100 Subject: [PATCH 412/648] added todo --- lapis/storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis/storage.py b/lapis/storage.py index 03cb2a3..312e130 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -120,6 +120,7 @@ async def transfer(self, file, job_repr): """ await self.connection.transfer(file.filesize) try: + # TODO: needs handling of KeyError await self.update_file(self.files[file.filename], job_repr) except AttributeError: pass From 06639e4d8d0802d5f99b07c624a5caa7c124df7a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:28:09 +0100 Subject: [PATCH 413/648] ignored B006 for flake8 --- lapis/storage_io/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index b59ef37..a421ed1 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -31,7 +31,7 @@ def storage_content_reader( unit_conversion_mapping: dict = { "filesize": 1024 * 1024 * 1024, "usedsize": 1024 * 1024 * 1024, - }, + }, # noqa: B006 ): reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") cache_information = dict() From 0a885e6b1fe77b226d33876fb2d2ee245c4540f8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:46:43 +0100 Subject: [PATCH 414/648] if file is available on storage, transfer now receives correct size --- lapis/connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/connection.py b/lapis/connection.py index c548190..51a7b43 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -92,7 +92,7 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): else: await sampling_required.put(used_connection) print("now transfering", requested_file.filesize) - await used_connection.transfer(requested_file, job_repr) + await used_connection.transfer(requested_file.filesize, job_repr) print( "Job {}: finished transfering of file {}: {}GB @ {}".format( job_repr, requested_file.filename, requested_file.filesize, time.now From 1f782dc1c870ed0fc08c6c1c1add04532d43baa3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:49:21 +0100 Subject: [PATCH 415/648] fixed position of noqa --- lapis/storage_io/storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index a421ed1..1e0dabb 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -7,10 +7,10 @@ def storage_reader( storage, storage_content, - unit_conversion_mapping: dict = { + unit_conversion_mapping: dict = { # noqa: B006 "cachesizeGB": 1024 * 1024 * 1024, "throughput_limit": 1024 * 1024 * 1024, - }, # noqa: B006 + }, ): storage_content = storage_content_reader(storage_content) reader = 
csv.DictReader(storage, delimiter=" ", quotechar="'") @@ -28,10 +28,10 @@ def storage_reader( def storage_content_reader( file_name, - unit_conversion_mapping: dict = { + unit_conversion_mapping: dict = { # noqa: B006 "filesize": 1024 * 1024 * 1024, "usedsize": 1024 * 1024 * 1024, - }, # noqa: B006 + }, ): reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") cache_information = dict() From 62579c2c842b39c2d615385c7e236cf8e5342589 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:49:59 +0100 Subject: [PATCH 416/648] made determine_inputfile_source private to connection --- lapis/connection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 51a7b43..b97be78 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -29,7 +29,7 @@ def add_storage_element(self, storage_element: Storage): except KeyError: self.storages[storage_element.sitename] = [storage_element] - async def determine_inputfile_source( + async def _determine_inputfile_source( self, requested_file: RequestedFile, dronesite: str, job_repr: str ): """ @@ -76,7 +76,7 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): :param job_repr: :return: """ - used_connection = await self.determine_inputfile_source( + used_connection = await self._determine_inputfile_source( requested_file, dronesite, job_repr ) From 5282e1b62eda980a550ce6ff93a5fd232d4cea92 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 22:50:29 +0100 Subject: [PATCH 417/648] renamed transfer_inputfiles to transfer_files --- lapis/connection.py | 2 +- lapis/job.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index b97be78..1cb6c48 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -99,7 +99,7 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): ) ) - async def transfer_inputfiles(self, drone, requested_files: dict, job_repr): + async def transfer_files(self, drone, requested_files: dict, job_repr): """ Converts dict information about requested files to RequestedFile object and parallely launches streaming for all files diff --git a/lapis/job.py b/lapis/job.py index b58bf46..7ca7844 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -125,7 +125,7 @@ def get_calculation_time(self, calculation_efficiency=0.9): async def transfer_inputfiles(self): print("TRANSFERING INPUTFILES: Job {} @ {}".format(repr(self), time.now)) if self.drone.connection and self.used_inputfiles: - self._streamtime = await self.drone.connection.transfer_inputfiles( + self._streamtime = await self.drone.connection.transfer_files( self.drone, self.requested_inputfiles, repr(self) ) From 4a0cdc3178d9310297501cfdddedc208cbc49ca8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:10:18 +0100 Subject: [PATCH 418/648] removed queue from file lookup in storage and improved determine inputfile source --- lapis/connection.py | 27 +++++++++++---------------- lapis/storage.py | 13 ++++++------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 1cb6c48..c64db94 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -4,7 +4,7 @@ from lapis.files import RequestedFile from lapis.monitor import sampling_required -from usim import Queue, Scope, time, Pipe +from usim import Scope, time, Pipe import random @@ -45,25 +45,20 @@ async def _determine_inputfile_source( :return: """ 
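# Illustrative sketch (not part of the patch series): the source selection
# implemented in the hunk below, standalone.  Look-up results are sorted by
# the amount of cached data; the best cache holding any data wins, otherwise
# the remote connection is used.  REMOTE is a stand-in placeholder.
from typing import Any, List, Tuple

REMOTE = "remote connection"


def choose_source(lookups: List[Tuple[int, Any]]) -> Any:
    for cached_size, storage in sorted(lookups, key=lambda entry: entry[0], reverse=True):
        if cached_size > 0:
            return storage
    return REMOTE


assert choose_source([(0, "cache A"), (7, "cache B")]) == "cache B"
assert choose_source([(0, "cache A")]) == REMOTE
assert choose_source([]) == REMOTE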
provided_storages = self.storages.get(dronesite, None) - if provided_storages: - look_up_queue = Queue() - async with Scope() as scope: - for storage in provided_storages: - scope.do( - storage.look_up_file(requested_file, look_up_queue, job_repr) - ) - await look_up_queue.close() + if provided_storages is not None: + look_up_list = [] + for storage in provided_storages: + look_up_list.append(storage.look_up_file(requested_file, job_repr)) storage_list = sorted( - [entry async for entry in look_up_queue], + [entry async for entry in look_up_list], key=lambda x: x[0], reverse=True, ) - if storage_list[0].cached_filesize > 0: - return storage_list[0].storage - else: - return self.remote_connection - else: - return self.remote_connection + for entry in storage_list: + # TODO: check should better check that size is bigger than requested + if entry.cached_filesize > 0: + return entry.storage + return self.remote_connection async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): """ diff --git a/lapis/storage.py b/lapis/storage.py index 312e130..44af9d0 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -1,4 +1,4 @@ -from usim import time, Resources, Pipe, Queue +from usim import time, Resources, Pipe from typing import Optional, NamedTuple @@ -153,12 +153,11 @@ async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): for file in to_be_removed: await self.remove(file, job_repr) - async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_repr): + def look_up_file(self, requested_file: RequestedFile, job_repr): """ Searches storage object for the requested_file and sends result (amount of cached data, storage object) to the queue. :param requested_file: - :param queue: :param job_repr: Needed for debug output, will be replaced :return: (amount of cached data, storage object) """ @@ -167,10 +166,9 @@ async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_re job_repr, requested_file.filename, repr(self), time.now ) ) - try: - await queue.put( - LookUpInformation(self.files[requested_file.filename].filesize, self) + result = LookUpInformation( + self.files[requested_file.filename].filesize, self ) except KeyError: print( @@ -178,7 +176,8 @@ async def look_up_file(self, requested_file: RequestedFile, queue: Queue, job_re requested_file.filename ) ) - await queue.put(LookUpInformation(0, self)) + result = LookUpInformation(0, self) + return result def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self.name or id(self)) From 535ac0ca8f1f30f611351749d016aceabb0d907d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:10:47 +0100 Subject: [PATCH 419/648] improved stream file in connection --- lapis/connection.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index c64db94..0ba3751 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -75,24 +75,22 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): requested_file, dronesite, job_repr ) + await sampling_required.put(used_connection) if used_connection == self.remote_connection and self.storages.get( dronesite, None ): - await sampling_required.put(used_connection) - potential_cache = random.choice(self.storages[dronesite]) - await used_connection.transfer(requested_file.filesize) - if potential_cache: + try: + potential_cache = random.choice(self.storages[dronesite]) await 
potential_cache.apply_caching_decision(requested_file, job_repr) - - else: - await sampling_required.put(used_connection) - print("now transfering", requested_file.filesize) - await used_connection.transfer(requested_file.filesize, job_repr) - print( - "Job {}: finished transfering of file {}: {}GB @ {}".format( - job_repr, requested_file.filename, requested_file.filesize, time.now - ) + except KeyError: + pass + print(f"now transfering {requested_file.filesize} from {used_connection}") + await used_connection.transfer(requested_file.filesize, job_repr) + print( + "Job {}: finished transfering of file {}: {}GB @ {}".format( + job_repr, requested_file.filename, requested_file.filesize, time.now ) + ) async def transfer_files(self, drone, requested_files: dict, job_repr): """ From c3a6b2237d2248cb6ccf9016cc787eab55c7ea38 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:39:47 +0100 Subject: [PATCH 420/648] introduced HitrateStorage that transfers data based on a cache hitrate --- lapis/storage.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 44af9d0..7694916 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -1,4 +1,4 @@ -from usim import time, Resources, Pipe +from usim import time, Resources, Pipe, Scope from typing import Optional, NamedTuple @@ -24,6 +24,7 @@ class Storage(object): "filenames", "cachealgorithm", "connection", + "remote_connection", ) def __init__( @@ -45,7 +46,7 @@ def __init__( ) self.cachealgorithm = CacheAlgorithm(self) self.connection = Pipe(throughput_limit) - self.__repr__() + self.remote_connection = None @property def usedstorage(self): @@ -181,3 +182,33 @@ def look_up_file(self, requested_file: RequestedFile, job_repr): def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self.name or id(self)) + + +class HitrateStorage(Storage): + def __init__( + self, + hitrate, + name: Optional[str] = None, + sitename: Optional[str] = None, + storagesize: int = 1000, + throughput_limit: int = 10, + files: Optional[dict] = None, + ): + super(HitrateStorage, self).__init__( + name=name, + sitename=sitename, + storagesize=storagesize, + throughput_limit=throughput_limit, + files=files, + ) + self._hitrate = hitrate + + async def transfer(self, file, job_repr): + async with Scope() as scope: + scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) + scope.do( + self.remote_connection.transfer(total=1 - self._hitrate * file.filesize) + ) + + def look_up_file(self, requested_file: RequestedFile, job_repr): + return LookUpInformation(requested_file.filesize) From b73b21f10f48e2253a4d09118c9af0b9b2daad43 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:50:44 +0100 Subject: [PATCH 421/648] fixed position of noqa --- lapis/connection.py | 6 +++--- lapis/storage.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 0ba3751..e32e2e3 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -1,12 +1,12 @@ +import random import logging +from usim import Scope, time, Pipe + from lapis.storage import Storage from lapis.files import RequestedFile from lapis.monitor import sampling_required -from usim import Scope, time, Pipe -import random - class Connection(object): diff --git a/lapis/storage.py b/lapis/storage.py index 7694916..9de66c3 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -1,7 +1,7 @@ -from usim import time, Resources, Pipe, 
Scope - from typing import Optional, NamedTuple +from usim import time, Resources, Pipe, Scope + from lapis.files import StoredFile, RequestedFile from lapis.cachealgorithm import CacheAlgorithm From 3b4bd41f19d5da7a7747ef4060e082427b903b64 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:52:40 +0100 Subject: [PATCH 422/648] removed cachehitrate from connection --- lapis/cli/simulate.py | 2 +- lapis/connection.py | 5 ++--- lapis/simulator.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index aebb530..1dc941b 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -97,7 +97,7 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit simulator.create_scheduler(scheduler_type=CondorJobScheduler) if all(storage_files): - simulator.create_connection_module(remote_throughput, cache_hitrate) + simulator.create_connection_module(remote_throughput) storage_file, storage_content_file, storage_type = storage_files simulator.create_storage( storage_input=storage_file, diff --git a/lapis/connection.py b/lapis/connection.py index e32e2e3..4c50102 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -10,12 +10,11 @@ class Connection(object): - __slots__ = ("storages", "remote_connection", "cachehitrate") + __slots__ = ("storages", "remote_connection") - def __init__(self, throughput=100, cache_hitrate=None): + def __init__(self, throughput=100): self.storages = dict() self.remote_connection = Pipe(throughput=throughput) - self.cachehitrate = cache_hitrate def add_storage_element(self, storage_element: Storage): """ diff --git a/lapis/simulator.py b/lapis/simulator.py index 18951f9..c7ab049 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -77,8 +77,8 @@ def create_storage(self, storage_input, storage_content_input, storage_reader): def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) - def create_connection_module(self, remote_throughput, cache_hitrate): - self.connection = Connection(remote_throughput, cache_hitrate) + def create_connection_module(self, remote_throughput): + self.connection = Connection(remote_throughput) def run(self, until=None): print(f"running until {until}") From 816216722642a193e243827d1f0c88caf9894fee Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:54:15 +0100 Subject: [PATCH 423/648] connection now sets reference to remote_connection for storage --- lapis/connection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis/connection.py b/lapis/connection.py index 4c50102..b7958c8 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -23,6 +23,7 @@ def add_storage_element(self, storage_element: Storage): :param storage_element: :return: """ + storage_element.remote_connection = self.remote_connection try: self.storages[storage_element.sitename].append(storage_element) except KeyError: From 18238e4254280a79fc4d4073a1edad57bd00a5f0 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 27 Nov 2019 23:55:13 +0100 Subject: [PATCH 424/648] storage objects are now created based on specified cache hit rate --- lapis/cli/simulate.py | 6 +++++ lapis/connection.py | 54 +------------------------------------ lapis/simulator.py | 8 ++++-- lapis/storage_io/storage.py | 8 +++--- 4 files changed, 18 insertions(+), 58 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 1dc941b..3fcf545 100644 --- a/lapis/cli/simulate.py +++ 
b/lapis/cli/simulate.py @@ -1,3 +1,5 @@ +from functools import partial + import click import logging.handlers @@ -9,6 +11,7 @@ from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader +from lapis.storage import Storage, HitrateStorage from lapis.storage_io.storage import storage_reader from lapis.scheduler import CondorJobScheduler @@ -103,6 +106,9 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit storage_input=storage_file, storage_content_input=storage_content_file, storage_reader=storage_import_mapper[storage_type], + storage_type=partial(HitrateStorage, cache_hitrate) + if cache_hitrate is not None + else Storage, ) for current_pool in pool_file: pool_file, pool_file_type = current_pool diff --git a/lapis/connection.py b/lapis/connection.py index b7958c8..91f3b15 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -1,5 +1,4 @@ import random -import logging from usim import Scope, time, Pipe @@ -108,60 +107,9 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): requested_file = RequestedFile( inputfilename, inputfilespecs["filesize"] ) - if self.cachehitrate is not None: - - scope.do( - self.transfer_by_cache_hitrate( - self.storages.get(drone.sitename, None), requested_file - ) - ) - else: - scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) + scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) stream_time = time.now - start_time print( "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) ) return stream_time - - async def transfer_by_cache_hitrate( - self, available_storages: Storage, requested_file: RequestedFile - ): - if not available_storages and self.cachehitrate: - logging.getLogger("implementation").error( - "no available caches for drone " - " requested cachehitrate was " - "{}".format(self.cachehitrate) - ) - else: - if 0 < self.cachehitrate < 1: - async with Scope() as scope: - scope.do( - self.transfer_wrapper( - self.remote_connection, - (1.0 - self.cachehitrate) * requested_file.filesize, - ) - ) - scope.do( - self.transfer_wrapper( - available_storages[0].connection, - self.cachehitrate * requested_file.filesize, - ) - ) - elif self.cachehitrate == 1: - await available_storages[0].connection.transfer(requested_file.filesize) - elif self.cachehitrate == 0: - await self.remote_connection.transfer(requested_file.filesize) - - async def transfer_wrapper(self, connection, total): - print( - "transfering {} with {}, start @ {}".format( - total, connection.throughput, time.now - ) - ) - await sampling_required.put(connection) - await connection.transfer(total=total) - print( - "transfering {} with {}, stop @ {}".format( - total, connection.throughput, time.now - ) - ) diff --git a/lapis/simulator.py b/lapis/simulator.py index c7ab049..beb33a9 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -67,10 +67,14 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): if controller: self.controllers.append(controller(target=pool, rate=1)) - def create_storage(self, storage_input, storage_content_input, storage_reader): + def create_storage( + self, storage_input, storage_content_input, storage_reader, storage_type + ): assert self.connection, "Connection module needs to be created before storages" for storage in storage_reader( - storage=storage_input, storage_content=storage_content_input + storage=storage_input, + 
storage_content=storage_content_input, + storage_type=storage_type, ): self.connection.add_storage_element(storage) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 1e0dabb..9d9e389 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -1,12 +1,13 @@ import csv +from functools import partial from lapis.files import StoredFile -from lapis.storage import Storage def storage_reader( storage, storage_content, + storage_type, unit_conversion_mapping: dict = { # noqa: B006 "cachesizeGB": 1024 * 1024 * 1024, "throughput_limit": 1024 * 1024 * 1024, @@ -15,7 +16,8 @@ def storage_reader( storage_content = storage_content_reader(storage_content) reader = csv.DictReader(storage, delimiter=" ", quotechar="'") for row in reader: - yield Storage( + yield partial( + storage_type, name=row["name"], sitename=row["sitename"], storagesize=int( @@ -23,7 +25,7 @@ def storage_reader( ), throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], - ) + )() def storage_content_reader( From 2df47697b6a827a71b8752b2784deadc836ef7b1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 13:38:19 +0100 Subject: [PATCH 425/648] adapted usage of caching for jobs --- lapis/job.py | 87 +++++++++++++++++++++++----------------------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 7ca7844..063e8c8 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -1,7 +1,7 @@ import logging from typing import Optional, TYPE_CHECKING -from usim import time +from usim import time, Scope, instant from usim import CancelTask from lapis.monitor import sampling_required @@ -14,9 +14,7 @@ class Job(object): __slots__ = ( "resources", "used_resources", - "_walltime", - "_calculationtime", - "_streamtime", + "walltime", "requested_walltime", "queue_date", "requested_inputfiles", @@ -61,12 +59,8 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] - self._walltime = used_resources.pop("walltime") - self._calculationtime = self.get_calculation_time() - self._streamtime = 0 + self.walltime = used_resources.pop("walltime") self.requested_walltime = resources.pop("walltime", None) - self.requested_inputfiles = resources.pop("inputfiles", None) - self.used_inputfiles = used_resources.pop("inputfiles", None) self.queue_date = queue_date assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since @@ -75,6 +69,10 @@ def __init__( self._name = name self._success: Optional[bool] = None + # caching-related + self.requested_inputfiles = resources.pop("inputfiles", None) + self.used_inputfiles = used_resources.pop("inputfiles", None) + @property def name(self) -> str: return self._name or id(self) @@ -95,49 +93,40 @@ def waiting_time(self) -> float: return self.in_queue_until - self.in_queue_since return float("Inf") - @property - def walltime(self) -> float: - """ - :return: Time that passes while job is running - """ - return self._streamtime + self.calculation_time - - @property - def calculation_time(self): - print("WALLTIME: Job {} @ {}".format(repr(self), time.now)) - return self._calculationtime - - def get_calculation_time(self, calculation_efficiency=0.9): + async def _calculate(self, calculation_efficiency=0.9): """ Determines a jobs calculation time based on the jobs CPU time and a calculation efficiency representing inefficient programming. 
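In PATCH 424 above, the concrete storage class becomes a parameter: the CLI pre-binds the configured cache hit rate with functools.partial(HitrateStorage, cache_hitrate) and passes the resulting factory into storage_reader as storage_type, so the reader builds plain storages and hit-rate storages through the same call. A minimal sketch of that factory pattern, using simplified stand-in classes rather than the real lapis types:

# Sketch of the storage_type factory from PATCH 424: the class (plus its extra
# hit-rate argument) is chosen once via functools.partial, while the reader
# only supplies the per-row keyword arguments.
from functools import partial


class PlainStorage:  # stand-in for lapis.storage.Storage
    def __init__(self, name, size):
        self.name, self.size = name, size


class HitrateStorage(PlainStorage):  # stand-in for the class from PATCH 420
    def __init__(self, hitrate, name, size):
        super().__init__(name=name, size=size)
        self.hitrate = hitrate


def read_storages(storage_type):
    # The reader does not care whether it receives a class or a pre-bound partial.
    for row in ({"name": "cache-a", "size": 10}, {"name": "cache-b", "size": 20}):
        yield storage_type(**row)


cache_hitrate = 0.9
factory = (
    partial(HitrateStorage, cache_hitrate) if cache_hitrate is not None else PlainStorage
)
for storage in read_storages(factory):
    print(storage.name, getattr(storage, "hitrate", None))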
:param calculation_efficiency: :return: """ + print(f"WALLTIME: Job {self} @ {time.now}") + result = self.walltime try: - return ( + result = ( self.used_resources["cores"] / calculation_efficiency - ) * self._walltime + ) * self.walltime except KeyError: - # logging.getLogger("implementation").info() - return self._walltime - - async def transfer_inputfiles(self): - print("TRANSFERING INPUTFILES: Job {} @ {}".format(repr(self), time.now)) - if self.drone.connection and self.used_inputfiles: - self._streamtime = await self.drone.connection.transfer_files( - self.drone, self.requested_inputfiles, repr(self) - ) + pass + start = time.now + await (time + result) + print(f"finished calculation at {time.now - start}") + async def _transfer_inputfiles(self): + try: + start = time.now + print(f"TRANSFERING INPUTFILES: Job {self} @ {start}") + await self.drone.connection.transfer_files( + drone=self.drone, + requested_files=self.used_inputfiles, + job_repr=repr(self), + ) print( - "streamed inputfiles {} for job {} in {} timeunits, finished @ {}" - "".format( - self.requested_inputfiles.keys(), - repr(self), - self._streamtime, - time.now, - ) + f"streamed inputfiles {self.used_inputfiles.keys()} for job {self} " + f"in {time.now - start} timeunits, finished @ {time.now}" ) + except AttributeError: + pass async def run(self, drone: "Drone"): assert drone, "Jobs cannot run without a drone being assigned" @@ -147,22 +136,24 @@ async def run(self, drone: "Drone"): await sampling_required.put(self) print("running job {} in drone {}".format(repr(self), repr(self.drone))) try: - if self.requested_inputfiles: - await self.transfer_inputfiles() - await (time + self.calculation_time) - else: - # ToDo: improve handling of jobs without inputfiles (correct value in - # self.walltime and therefore in monitoring etc) - await (time + self._walltime) - print(self.calculation_time, self._streamtime, self.walltime) + start = time.now + async with Scope() as scope: + await instant + scope.do(self._transfer_inputfiles()) + scope.do(self._calculate()) except CancelTask: self.drone = None self._success = False + # TODO: in_queue_until is still set except BaseException: self.drone = None self._success = False + # TODO: in_queue_until is still set raise else: + old_walltime = self.walltime + self.walltime = time.now - start + print(f"monitored walltime of {old_walltime} changed to {self.walltime}") self.drone = None self._success = True await sampling_required.put(self) From 24a024025f5431b4183db4b5b973b6772a3e8f1f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 14:08:09 +0100 Subject: [PATCH 426/648] introduced calculation efficiency to job --- lapis/job.py | 9 ++++++--- lapis/job_io/htcondor.py | 3 +++ lapis/job_io/swf.py | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 063e8c8..b50ffd0 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -24,6 +24,7 @@ class Job(object): "_name", "drone", "_success", + "calculation_efficiency", ) def __init__( @@ -34,6 +35,7 @@ def __init__( queue_date: float = 0, name: str = None, drone: "Drone" = None, + calculation_efficiency: Optional[float] = None, ): """ Definition of a job that uses a specified amount of resources `used_resources` @@ -68,6 +70,7 @@ def __init__( self.drone = drone self._name = name self._success: Optional[bool] = None + self.calculation_efficiency = calculation_efficiency # caching-related self.requested_inputfiles = resources.pop("inputfiles", None) @@ -93,7 +96,7 @@ def waiting_time(self) 
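PATCH 425 above redefines a job's walltime: input-file streaming and the CPU-bound calculation now run concurrently inside a Scope, the calculation time is stretched by the configured efficiency as (cores / efficiency) * walltime, and the monitored walltime becomes the measured span between start and completion. A rough illustration of that timing model using asyncio in place of usim; the durations below are invented for the example.

# Rough asyncio stand-in for the concurrent Scope used in Job.run (PATCH 425):
# the monitored walltime ends up being the maximum of transfer time and
# calculation time, because both phases overlap.
import asyncio
import time


async def transfer_inputfiles(transfer_time: float) -> None:
    await asyncio.sleep(transfer_time)  # streaming the input files


async def calculate(cores: float, efficiency: float, cpu_walltime: float) -> None:
    # Same scaling as Job._calculate: low efficiency stretches the CPU time.
    await asyncio.sleep((cores / efficiency) * cpu_walltime)


async def run_job() -> float:
    start = time.monotonic()
    await asyncio.gather(
        transfer_inputfiles(transfer_time=0.3),
        calculate(cores=1, efficiency=0.9, cpu_walltime=0.2),
    )
    return time.monotonic() - start  # the newly monitored walltime


print(f"measured walltime: {asyncio.run(run_job()):.2f}s")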
-> float: return self.in_queue_until - self.in_queue_since return float("Inf") - async def _calculate(self, calculation_efficiency=0.9): + async def _calculate(self): """ Determines a jobs calculation time based on the jobs CPU time and a calculation efficiency representing inefficient programming. @@ -104,9 +107,9 @@ async def _calculate(self, calculation_efficiency=0.9): result = self.walltime try: result = ( - self.used_resources["cores"] / calculation_efficiency + self.used_resources["cores"] / self.calculation_efficiency ) * self.walltime - except KeyError: + except (KeyError, TypeError): pass start = time.now await (time + result) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index f22be5e..c95183d 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -1,6 +1,7 @@ import csv import json import logging +from typing import Optional from lapis.job import Job from copy import deepcopy @@ -8,6 +9,7 @@ def htcondor_job_reader( iterable, + calculation_efficiency: Optional[float] = None, resource_name_mapping={ # noqa: B006 "cores": "RequestCpus", "walltime": "RequestWalltime", # s @@ -91,4 +93,5 @@ def htcondor_job_reader( resources=resources, used_resources=used_resources, queue_date=float(entry[used_resource_name_mapping["queuetime"]]), + calculation_efficiency=calculation_efficiency, ) diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index 99124bd..bc75d20 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -4,12 +4,14 @@ [Standard Workload Format](http://www.cs.huji.ac.il/labs/parallel/workload/swf.html). """ import csv +from typing import Optional from lapis.job import Job def swf_job_reader( iterable, + calculation_efficiency: Optional[float] = None, resource_name_mapping={ # noqa: B006 "cores": "Requested Number of Processors", "walltime": "Requested Time", # s @@ -90,4 +92,5 @@ def swf_job_reader( used_resources=used_resources, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), name=row[header["Job Number"]], + calculation_efficiency=calculation_efficiency, ) From f984017301ad542c7f6863cdd86beb14e81dd736 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 14:09:25 +0100 Subject: [PATCH 427/648] introduced calculation efficiency for jobs to cli --- lapis/cli/simulate.py | 22 ++++++++++++++++++---- lapis/simulator.py | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 3fcf545..d2a1701 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -38,11 +38,13 @@ @click.option("--log-tcp", "log_tcp", is_flag=True) @click.option("--log-file", "log_file", type=click.File("w")) @click.option("--log-telegraf", "log_telegraf", is_flag=True) +@click.option("--calculation-efficiency", type=float) @click.pass_context -def cli(ctx, seed, until, log_tcp, log_file, log_telegraf): +def cli(ctx, seed, until, log_tcp, log_file, log_telegraf, calculation_efficiency): ctx.ensure_object(dict) ctx.obj["seed"] = seed ctx.obj["until"] = until + ctx.obj["calculation_efficiency"] = calculation_efficiency monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) time_filter = SimulationTimeFilter() @@ -95,7 +97,11 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator( - job_input=file, job_reader=job_import_mapper[file_type] + job_input=file, + job_reader=partial( + job_import_mapper[file_type], 
+ calculation_efficiency=ctx.obj["calculation_efficiency"], + ), ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) @@ -139,7 +145,11 @@ def dynamic(ctx, job_file, pool_file): simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator( - job_input=file, job_reader=job_import_mapper[file_type] + job_input=file, + job_reader=partial( + job_import_mapper[file_type], + calculation_efficiency=ctx.obj["calculation_efficiency"], + ), ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in pool_file: @@ -178,7 +188,11 @@ def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file): simulator = Simulator(seed=ctx.obj["seed"]) file, file_type = job_file simulator.create_job_generator( - job_input=file, job_reader=job_import_mapper[file_type] + job_input=file, + job_reader=partial( + job_import_mapper[file_type], + calculation_efficiency=ctx.obj["calculation_efficiency"], + ), ) simulator.create_scheduler(scheduler_type=CondorJobScheduler) for current_pool in static_pool_file: diff --git a/lapis/simulator.py b/lapis/simulator.py index beb33a9..b2dcda4 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -104,5 +104,5 @@ async def _simulate(self, end): async def _queue_jobs(self, job_input, job_reader): await job_to_queue_scheduler( - job_generator=job_reader(job_input), job_queue=self.job_queue + job_generator=partial(job_reader, job_input)(), job_queue=self.job_queue ) From 3b011ebd58891574f7d85d2c524fead039e99ac5 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 14:09:54 +0100 Subject: [PATCH 428/648] added more type hints for job --- lapis/job.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index b50ffd0..fd15b93 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -33,8 +33,8 @@ def __init__( used_resources: dict, in_queue_since: float = 0, queue_date: float = 0, - name: str = None, - drone: "Drone" = None, + name: Optional[str] = None, + drone: "Optional[Drone]" = None, calculation_efficiency: Optional[float] = None, ): """ @@ -61,12 +61,12 @@ def __init__( self.used_resources[key], ) self.resources[key] = self.used_resources[key] - self.walltime = used_resources.pop("walltime") - self.requested_walltime = resources.pop("walltime", None) + self.walltime: int = used_resources.pop("walltime") + self.requested_walltime: Optional[int] = resources.pop("walltime", None) self.queue_date = queue_date assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since - self.in_queue_until = None + self.in_queue_until: Optional[float] = None self.drone = drone self._name = name self._success: Optional[bool] = None From 9cc625a1d130b305058d1b21b88a03aba2f9f96a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 16:08:34 +0100 Subject: [PATCH 429/648] removed initialisation of connection --- lapis/drone.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 7217b32..ffe8c61 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -28,10 +28,7 @@ def __init__( """ super(Drone, self).__init__() self.scheduler = scheduler - if connection is not None: - self.connection = connection - else: - self.connection = Connection() + self.connection = connection self.sitename = sitename self.pool_resources = pool_resources self.resources = Capacities(**pool_resources) From 08d29072c6cdb58d0997acdacf569a4f7f633695 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: 
Thu, 28 Nov 2019 17:04:04 +0100 Subject: [PATCH 430/648] moved caching related monitoring to extra file --- lapis/monitor/caching.py | 91 ++++++++++++++++++++++++++++++++++++++++ lapis/monitor/general.py | 86 ------------------------------------- 2 files changed, 91 insertions(+), 86 deletions(-) create mode 100644 lapis/monitor/caching.py diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py new file mode 100644 index 0000000..b1068d6 --- /dev/null +++ b/lapis/monitor/caching.py @@ -0,0 +1,91 @@ +import logging + +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter +from usim import Pipe + +from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler +from lapis.storage import Storage + + +def storage_status(storage: Storage) -> list: + """ + Log information about current storage object state + :param storage: + :return: list of records for logging + """ + results = [ + { + "storage": repr(storage), + "usedstorage": storage.usedstorage, + "storagesize": storage.storagesize, + "numberoffiles": len(storage.files), + } + ] + return results + + +storage_status.name = "storage_status" +storage_status.whitelist = (Storage,) +storage_status.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "storage"}, resolution=1 + ), +} + + +def storage_connection(storage: Storage) -> list: + """ + Log information about the storages connection + :param storage: + :return: + """ + results = [ + { + "storage": repr(storage), + "throughput": storage.connection.throughput, + "requested_throughput": sum(storage.connection._subscriptions.values()), + "throughput_scale": storage.connection._throughput_scale, + } + ] + return results + + +storage_connection.name = "storage_connection" +storage_connection.whitelist = (Storage,) +storage_connection.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "storage"}, resolution=1 + ), +} + + +def remote_connection(remote: Pipe) -> list: + """ + Log information about the remote connection + :param remote: + :return: + """ + results = [ + { + "throughput": remote.throughput, + "requested_throughput": sum(remote._subscriptions.values()), + "throughput_scale": remote._throughput_scale, + } + ] + return results + + +remote_connection.name = "remote_connection" +remote_connection.whitelist = (Pipe,) +remote_connection.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), +} diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 51064ee..be6d24d 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -10,9 +10,6 @@ from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler from lapis.pool import Pool from lapis.scheduler import CondorJobScheduler, JobQueue -from lapis.storage import Storage - -from usim import Pipe if TYPE_CHECKING: from lapis.simulator import Simulator @@ -218,86 +215,3 @@ def configuration_information(simulator: "Simulator") -> List[Dict]: tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 ), } - - -def storage_status(storage: Storage) -> list: - """ - Log 
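The functions collected in lapis/monitor/caching.py by PATCH 430 above all follow one convention: a plain callable that turns a simulation object into a list of record dicts, tagged after its definition with a name, a whitelist of object types, and per-handler formatters. The condensed sketch below shows that convention together with a toy registry that dispatches on the whitelist; the registry and FakeStorage are invented for the illustration and are not the actual Monitoring class.

# Condensed sketch of the statistics convention used by the monitoring code:
# a callable producing record dicts, tagged with .name and .whitelist so a
# registry can route simulation objects to the matching statistics.
class FakeStorage:  # stand-in for lapis.storageelement.StorageElement
    def __init__(self, size, used, files):
        self.size, self.used, self.files = size, used, files


def storage_status(storage) -> list:
    return [{
        "storage": repr(storage),
        "usedstorage": storage.used,
        "storagesize": storage.size,
        "numberoffiles": len(storage.files),
    }]


storage_status.name = "storage_status"
storage_status.whitelist = (FakeStorage,)


class Registry:
    def __init__(self):
        self._statistics = {}

    def register(self, statistic):
        for klass in statistic.whitelist:
            self._statistics.setdefault(klass, set()).add(statistic)

    def emit(self, log_object):
        for statistic in self._statistics.get(type(log_object), set()):
            for record in statistic(log_object):
                print(statistic.name, record)


registry = Registry()
registry.register(storage_status)
registry.emit(FakeStorage(size=100, used=40, files={"a": 1}))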
information about current storage object state - :param storage: - :return: list of records for logging - """ - results = [ - { - "storage": repr(storage), - "usedstorage": storage.usedstorage, - "storagesize": storage.storagesize, - "numberoffiles": len(storage.files), - } - ] - return results - - -storage_status.name = "storage_status" -storage_status.whitelist = (Storage,) -storage_status.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1 - ), -} - - -def storage_connection(storage: Storage) -> list: - """ - Log information about the storages connection - :param storage: - :return: - """ - results = [ - { - "storage": repr(storage), - "throughput": storage.connection.throughput, - "requested_throughput": sum(storage.connection._subscriptions.values()), - "throughput_scale": storage.connection._throughput_scale, - } - ] - return results - - -storage_connection.name = "storage_connection" -storage_connection.whitelist = (Storage,) -storage_connection.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1 - ), -} - - -def remote_connection(remote: Pipe) -> list: - """ - Log information about the remote connection - :param remote: - :return: - """ - results = [ - { - "throughput": remote.throughput, - "requested_throughput": sum(remote._subscriptions.values()), - "throughput_scale": remote._throughput_scale, - } - ] - return results - - -remote_connection.name = "remote_connection" -remote_connection.whitelist = (Pipe,) -remote_connection.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 - ), -} From e9e6e07b99e2c52d146088612d49078a8184b88c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 17:05:05 +0100 Subject: [PATCH 431/648] each simulation run now can be identified --- lapis/monitor/__init__.py | 4 ++++ lapis/simulator.py | 15 +++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index c7e4039..9098cca 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -7,6 +7,9 @@ from usim import time, Queue +SIMULATION_START = None + + class LoggingSocketHandler(logging.handlers.SocketHandler): def makePickle(self, record): return self.format(record).encode() @@ -46,6 +49,7 @@ async def run(self): for statistic in self._statistics.get(type(log_object), set()): # do the logging for record in statistic(log_object): + record["tardis"] = "lapis-%s" % SIMULATION_START logging.getLogger(statistic.name).info(statistic.name, record) def register_statistic(self, statistic: Callable) -> None: diff --git a/lapis/simulator.py b/lapis/simulator.py index b2dcda4..ced7f8d 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -1,12 +1,15 @@ import logging import random +import time as pytime from functools import partial from usim import run, time, until, Scope, Queue +import lapis.monitor as monitor from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.connection import Connection +from lapis.monitor.caching import storage_status, storage_connection, 
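PATCH 431 above makes separate simulation runs distinguishable in the monitoring stream by stamping every record with an identifier derived from the wall-clock start time. A stripped-down illustration of just that tagging step, not the actual Monitoring loop:

# Stripped-down illustration of the run tagging from PATCH 431: one identifier
# per simulation run, attached to every emitted monitoring record.
import time as pytime

SIMULATION_START = pytime.time()  # set once when the run starts


def tag(record: dict) -> dict:
    record["tardis"] = "lapis-%s" % SIMULATION_START
    return record


print(tag({"storage": "<StorageElement: cache-a>", "usedstorage": 42}))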
remote_connection from lapis.monitor.general import ( user_demand, job_statistics, @@ -14,9 +17,6 @@ pool_status, configuration_information, job_events, - storage_status, - storage_connection, - remote_connection, ) from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -85,11 +85,12 @@ def create_connection_module(self, remote_throughput): self.connection = Connection(remote_throughput) def run(self, until=None): - print(f"running until {until}") + monitor.SIMULATION_START = pytime.time() + print(f"[lapis-{monitor.SIMULATION_START}] running until {until}") run(self._simulate(until)) async def _simulate(self, end): - print(f"Starting simulation at {time.now}") + print(f"[lapis-{monitor.SIMULATION_START}] Starting simulation at {time.now}") async with until(time == end) if end else Scope() as while_running: for pool in self.pools: while_running.do(pool.run(), volatile=True) @@ -100,7 +101,9 @@ async def _simulate(self, end): while_running.do(controller.run(), volatile=True) while_running.do(self.monitoring.run(), volatile=True) self.duration = time.now - print(f"Finished simulation at {self.duration}") + print( + f"[lapis-{monitor.SIMULATION_START}] Finished simulation at {self.duration}" + ) async def _queue_jobs(self, job_input, job_reader): await job_to_queue_scheduler( From ffef6eb671e0fcf1cd135826da4b3bf78f75685c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 17:08:23 +0100 Subject: [PATCH 432/648] added caching-specific monitoring information to documentation --- docs/source/topics/monitoring.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index 6651137..e48dba3 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -77,9 +77,9 @@ COBalD-specific Monitoring Caching-specific Monitoring ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. TODO:: - - Will be added as soon as the caching branch is merged. +.. autofunction:: lapis.monitor.caching.storage_status +.. autofunction:: lapis.monitor.caching.storage_connection +.. 
autofunction:: lapis.monitor.caching.remote_connection Telegraf -------- From 90d9e075358dbe16878353567ae8a8fbdee2075b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 17:18:10 +0100 Subject: [PATCH 433/648] added type hints for simulator --- lapis/simulator.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index ced7f8d..4932af7 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -3,6 +3,9 @@ import time as pytime from functools import partial +from typing import List + +from cobald.interfaces import Controller from usim import run, time, until, Scope, Queue import lapis.monitor as monitor @@ -20,7 +23,7 @@ ) from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics - +from lapis.pool import Pool logging.getLogger("implementation").propagate = False @@ -28,13 +31,12 @@ class Simulator(object): def __init__(self, seed=1234): random.seed(seed) - self.job_queue = Queue() - self.pools = [] + self.job_queue: Queue = Queue() + self.pools: List[Pool] = [] self.connection: Connection = None - self.controllers = [] + self.controllers: List[Controller] = [] self.job_scheduler = None self.job_generator = None - self.cost = 0 self._job_generators = [] self.monitoring = Monitoring() self.duration = None From 479f51e5ba3d79dfcbf9e9bbaa05ebeb63470d4a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 17:20:10 +0100 Subject: [PATCH 434/648] changed sizes for storage to bytes --- lapis/storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 9de66c3..42640d7 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -31,8 +31,8 @@ def __init__( self, name: Optional[str] = None, sitename: Optional[str] = None, - storagesize: int = 1000, - throughput_limit: int = 10, + storagesize: int = 1000 * 1024 * 1024 * 1024, + throughput_limit: int = 10 * 1024 * 1024 * 1024, files: Optional[dict] = None, ): self.name = name From 033495e838d9658218c0b232aa95f7036fbc8bcf Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 28 Nov 2019 20:07:19 +0100 Subject: [PATCH 435/648] fixed bug leading to full RAM --- lapis/storage_io/storage.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 9d9e389..f6b42e3 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -20,9 +20,8 @@ def storage_reader( storage_type, name=row["name"], sitename=row["sitename"], - storagesize=int( - row["cachesizeGB"] * unit_conversion_mapping.get("cachesizeGB", 1) - ), + storagesize=int(row["cachesizeGB"]) + * unit_conversion_mapping.get("cachesizeGB", 1), throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], )() @@ -39,10 +38,12 @@ def storage_content_reader( cache_information = dict() for row in reader: for key in row: + if key not in ["filename", "cachename"]: + row[key] = int(row[key]) row[key] = row[key] * unit_conversion_mapping.get(key, 1) cache_information.setdefault(row["cachename"], {})[ row["filename"] - ] = StoredFile(row["filename"], **row) + ] = StoredFile(**row) if not cache_information: return None return cache_information From 2e7d9f2a932ced97afdd33334a688238bbeeca65 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 22:37:36 +0100 Subject: [PATCH 436/648] added RemoteStorage --- lapis/connection.py | 11 ++++++----- lapis/storage.py | 8 ++++++++ 2 files 
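PATCH 434 above moves all storage sizes to bytes, and PATCH 435 repairs the conversion behind the "full RAM" bug: csv.DictReader yields strings, so multiplying row["cachesizeGB"] by 1024^3 before casting repeats the string rather than scaling a number, which plausibly explains the memory blow-up named in the commit message. A small sketch of the safe order of operations, with an invented row standing in for the parsed CSV:

# Sketch of the unit conversion as fixed later in this series: cast the CSV
# string to a number first, then scale it to bytes. Multiplying the raw string
# would repeat it (str * int) instead of scaling it.
GIB = 1024 * 1024 * 1024

unit_conversion_mapping = {"cachesizeGB": GIB, "throughput_limit": GIB}

row = {"name": "cache-a", "cachesizeGB": "10.5", "throughput_limit": "1"}  # as csv.DictReader yields it

size_bytes = int(float(row["cachesizeGB"]) * unit_conversion_mapping["cachesizeGB"])
throughput = int(float(row["throughput_limit"]) * unit_conversion_mapping["throughput_limit"])

assert size_bytes == int(10.5 * GIB)
print(size_bytes, throughput)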
changed, 14 insertions(+), 5 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 91f3b15..20c5777 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -1,8 +1,9 @@ import random +from typing import Union from usim import Scope, time, Pipe -from lapis.storage import Storage +from lapis.storage import Storage, RemoteStorage from lapis.files import RequestedFile from lapis.monitor import sampling_required @@ -13,7 +14,7 @@ class Connection(object): def __init__(self, throughput=100): self.storages = dict() - self.remote_connection = Pipe(throughput=throughput) + self.remote_connection = RemoteStorage(Pipe(throughput=throughput)) def add_storage_element(self, storage_element: Storage): """ @@ -30,7 +31,7 @@ def add_storage_element(self, storage_element: Storage): async def _determine_inputfile_source( self, requested_file: RequestedFile, dronesite: str, job_repr: str - ): + ) -> Union[Storage, RemoteStorage]: """ Collects NamedTuples containing the amount of data of the requested file cached in a storage element and the storage element for all reachable storage @@ -80,11 +81,11 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): ): try: potential_cache = random.choice(self.storages[dronesite]) - await potential_cache.apply_caching_decision(requested_file, job_repr) + await potential_cache._apply_caching_decision(requested_file, job_repr) except KeyError: pass print(f"now transfering {requested_file.filesize} from {used_connection}") - await used_connection.transfer(requested_file.filesize, job_repr) + await used_connection.transfer(requested_file, job_repr) print( "Job {}: finished transfering of file {}: {}GB @ {}".format( job_repr, requested_file.filename, requested_file.filesize, time.now diff --git a/lapis/storage.py b/lapis/storage.py index 42640d7..3c98fca 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -11,6 +11,14 @@ class LookUpInformation(NamedTuple): storage: "Storage" +class RemoteStorage(object): + def __init__(self, pipe: Pipe): + self._connection = pipe + + async def transfer(self, file, job_repr): + await self._connection.transfer(total=file.filesize) + + class Storage(object): __slots__ = ( From 1b6ab4b6a34f1bdf2f65b318f172d1281093c28b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 22:39:25 +0100 Subject: [PATCH 437/648] renamed storagesize to size and ensured correct units --- lapis/monitor/caching.py | 2 +- lapis/storage.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index b1068d6..604c748 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -18,7 +18,7 @@ def storage_status(storage: Storage) -> list: { "storage": repr(storage), "usedstorage": storage.usedstorage, - "storagesize": storage.storagesize, + "storagesize": storage.size, "numberoffiles": len(storage.files), } ] diff --git a/lapis/storage.py b/lapis/storage.py index 3c98fca..be50b10 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -24,7 +24,7 @@ class Storage(object): __slots__ = ( "name", "sitename", - "storagesize", + "size", "deletion_duration", "update_duration", "_usedstorage", @@ -39,7 +39,7 @@ def __init__( self, name: Optional[str] = None, sitename: Optional[str] = None, - storagesize: int = 1000 * 1024 * 1024 * 1024, + size: int = 1000 * 1024 * 1024 * 1024, throughput_limit: int = 10 * 1024 * 1024 * 1024, files: Optional[dict] = None, ): @@ -47,7 +47,7 @@ def __init__( self.sitename = sitename 
self.deletion_duration = 5 self.update_duration = 1 - self.storagesize = storagesize + self.size = size self.files = files self._usedstorage = Resources( size=sum(file.filesize for file in files.values()) @@ -62,7 +62,7 @@ def usedstorage(self): @property def free_space(self): - return self.storagesize - self.usedstorage + return self.size - self.usedstorage async def remove(self, file: StoredFile, job_repr): """ @@ -198,14 +198,14 @@ def __init__( hitrate, name: Optional[str] = None, sitename: Optional[str] = None, - storagesize: int = 1000, - throughput_limit: int = 10, + size: int = 1000 * 1024 * 1024 * 1024, + throughput_limit: int = 10 * 1024 * 1024 * 1024, files: Optional[dict] = None, ): super(HitrateStorage, self).__init__( name=name, sitename=sitename, - storagesize=storagesize, + size=size, throughput_limit=throughput_limit, files=files, ) From 721953ebc9ad5796c40b3c1f98370758a024882d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 22:40:23 +0100 Subject: [PATCH 438/648] ensured that size is always int --- lapis/storage_io/storage.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index f6b42e3..8e17601 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -20,8 +20,9 @@ def storage_reader( storage_type, name=row["name"], sitename=row["sitename"], - storagesize=int(row["cachesizeGB"]) - * unit_conversion_mapping.get("cachesizeGB", 1), + size=int( + row["cachesizeGB"] * unit_conversion_mapping.get("cachesizeGB", 1) + ), throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], )() From fdd84824e0c565a14de97936a8e4a6a935285f49 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 22:43:01 +0100 Subject: [PATCH 439/648] renamed method again --- lapis/connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/connection.py b/lapis/connection.py index 20c5777..8f7fd89 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -81,7 +81,7 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): ): try: potential_cache = random.choice(self.storages[dronesite]) - await potential_cache._apply_caching_decision(requested_file, job_repr) + await potential_cache.apply_caching_decision(requested_file, job_repr) except KeyError: pass print(f"now transfering {requested_file.filesize} from {used_connection}") From 94a1d9f28bd4b54e2ae35f338f4908effa7eff73 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 22:47:32 +0100 Subject: [PATCH 440/648] fixed semmle issue --- lapis/simulator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 4932af7..2bed938 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -21,7 +21,6 @@ configuration_information, job_events, ) -from lapis.monitor import Monitoring from lapis.monitor.cobald import drone_statistics, pool_statistics from lapis.pool import Pool @@ -38,7 +37,7 @@ def __init__(self, seed=1234): self.job_scheduler = None self.job_generator = None self._job_generators = [] - self.monitoring = Monitoring() + self.monitoring = monitor.Monitoring() self.duration = None def enable_monitoring(self): From 5d670f753574b41d673fc1f3aeea1b22196aca78 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 23:44:20 +0100 Subject: [PATCH 441/648] added type hints --- lapis/utilities/cache_cleanup_implementations.py | 7 ++++++- 1 file changed, 6 insertions(+), 
1 deletion(-) diff --git a/lapis/utilities/cache_cleanup_implementations.py b/lapis/utilities/cache_cleanup_implementations.py index 19da640..0ddf493 100644 --- a/lapis/utilities/cache_cleanup_implementations.py +++ b/lapis/utilities/cache_cleanup_implementations.py @@ -1,4 +1,9 @@ -def sort_files_by_cachedsince(stored_files: set): +from typing import List + +from lapis.files import StoredFile + + +def sort_files_by_cachedsince(stored_files: set) -> List[StoredFile]: return sorted(list(stored_files), key=lambda x: x.cachedsince) From 3255ce632cfb2b120f2bfd5e2217f49e74090b8a Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 28 Nov 2019 23:46:11 +0100 Subject: [PATCH 442/648] removed cachealgorithm from storage and moved to connection --- lapis/cachealgorithm.py | 112 ++++++++++++++++++---------------------- lapis/connection.py | 28 +++++++++- lapis/storage.py | 37 +++---------- 3 files changed, 83 insertions(+), 94 deletions(-) diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py index 0f4bb34..2e58758 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/cachealgorithm.py @@ -1,67 +1,57 @@ -from typing import Optional, Set -from lapis.files import RequestedFile -from lapis.utilities.cache_algorithm_implementations import cache_algorithm +from typing import Optional, Callable, Tuple + +from lapis.files import RequestedFile, StoredFile +from lapis.storage import Storage from lapis.utilities.cache_cleanup_implementations import sort_files_by_cachedsince -class CacheAlgorithm(object): - def __init__(self, storage, additional_information: Optional[str] = None): - """ - Cache Algorithm class defining the handling of uncached files. - It's functionality is called via the consider() function. - :param storage: storage object that this algorithm is - :param additional_information: placeholder for additional external - information that might be passed to the cache algoritm. - """ - self._storage = storage - self._additional_information = additional_information +def check_size(file: RequestedFile, storage: Storage): + return storage.size >= file.filesize + + +def check_relevance(file: RequestedFile, storage: Storage): + return True + - def _file_based_consideration(self, candidate: RequestedFile) -> bool: - """ - File based caching decision: Checks if candidate file should be cached based on - conditions that apply to - file itself without considering the caches overall state. 
- :param candidate: - :return: - """ - if self._storage.storagesize > candidate.filesize: - return cache_algorithm["standard"](candidate) - else: - return False +def delete_oldest( + file: RequestedFile, storage: Storage +) -> Tuple[bool, Tuple[StoredFile]]: + deletable_files = [] + currently_free = storage.free_space + if currently_free < storage.free_space: + sorted_files = sort_files_by_cachedsince(storage.files.items()) + while currently_free < file.filesize: + deletable_files.append(next(sorted_files)) + currently_free += deletable_files[-1].filesize + return True, tuple(deletable_files) - def _context_based_consideration(self, candidate: RequestedFile): - """ - Caching decision based on the the overall context - :param candidate: - :return: - """ - to_be_removed = set() - sorted_stored_files = sort_files_by_cachedsince(self._storage.files) - current_free_storage = self._storage.free_space - for stored_file in sorted_stored_files: - if stored_file.numberofaccesses < 3: - to_be_removed.add(stored_file) - current_free_storage += stored_file.filesize - if current_free_storage >= candidate.filesize: - return to_be_removed - else: - continue - if current_free_storage >= candidate.filesize: - return {candidate} - def consider(self, candidate: RequestedFile) -> Optional[Set[RequestedFile]]: - """ - Decides whether the requested file should be cached. - The decision is split into a decision that is based on the - requested file only and a decision that takes the overall context (current - cache state, other cached files) into account. - :param candidate: - :return: - """ - if self._file_based_consideration(candidate): - if self._storage.free_space < candidate.filesize: - return self._context_based_consideration(candidate) - else: - return set() - else: - return {candidate} +def delete_oldest_few_used( + file: RequestedFile, storage: Storage +) -> Tuple[bool, Optional[Tuple[StoredFile]]]: + deletable_files = [] + currently_free = storage.free_space + if currently_free < storage.free_space: + sorted_files = sort_files_by_cachedsince(storage.files.items()) + for current_file in sorted_files: + if current_file.numberofaccesses < 3: + deletable_files.append(current_file) + currently_free += deletable_files[-1].filesize + if currently_free >= file.filesize: + return True, tuple(deletable_files) + return False, None + + +class CacheAlgorithm(object): + def __init__(self, caching_strategy: Callable, deletion_strategy: Callable): + self._caching_strategy = lambda file, storage: check_size( + file, storage + ) and check_relevance(file, storage) + self._deletion_strategy = lambda file, storage: delete_oldest(file, storage) + + def consider( + self, file: RequestedFile, storage: Storage + ) -> Tuple[bool, Optional[Tuple[StoredFile]]]: + if self._caching_strategy(file, storage): + return self._deletion_strategy(file, storage) + return False, None diff --git a/lapis/connection.py b/lapis/connection.py index 8f7fd89..6a28d87 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -3,6 +3,12 @@ from typing import Union from usim import Scope, time, Pipe +from lapis.cachealgorithm import ( + CacheAlgorithm, + check_size, + check_relevance, + delete_oldest_few_used, +) from lapis.storage import Storage, RemoteStorage from lapis.files import RequestedFile from lapis.monitor import sampling_required @@ -10,11 +16,18 @@ class Connection(object): - __slots__ = ("storages", "remote_connection") + __slots__ = ("storages", "remote_connection", "caching_algorithm") def __init__(self, throughput=100): 
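PATCH 442 above splits the caching decision into two pluggable callables: a caching strategy that decides whether a requested file should be cached at all, and a deletion strategy that proposes which stored files to evict, oldest first, until the new file fits. Below is a compact standalone sketch of that split with simplified file records instead of the lapis classes; unlike delete_oldest in the patch, whose guard compares the free space against itself, the sketch simply runs the eviction loop whenever the file does not already fit.

# Compact sketch of the strategy split from PATCH 442: consider() first asks
# the caching strategy, then lets the deletion strategy pick evictable files
# (oldest first) until the requested file fits.
from dataclasses import dataclass, field
from typing import List, Tuple


@dataclass(frozen=True)
class FileRecord:  # simplified stand-in for StoredFile / RequestedFile
    name: str
    size: int
    cached_since: int = 0


@dataclass
class Cache:  # simplified stand-in for a storage element
    size: int
    files: List[FileRecord] = field(default_factory=list)

    @property
    def available(self) -> int:
        return self.size - sum(f.size for f in self.files)


def check_size(file: FileRecord, cache: Cache) -> bool:
    return cache.size >= file.size


def delete_oldest(file: FileRecord, cache: Cache) -> Tuple[bool, Tuple[FileRecord, ...]]:
    to_delete, free = [], cache.available
    for stored in sorted(cache.files, key=lambda f: f.cached_since):
        if free >= file.size:
            break
        to_delete.append(stored)
        free += stored.size
    return free >= file.size, tuple(to_delete)


def consider(file: FileRecord, cache: Cache) -> Tuple[bool, Tuple[FileRecord, ...]]:
    if not check_size(file, cache):
        return False, ()
    return delete_oldest(file, cache)


cache = Cache(size=100, files=[FileRecord("old", 60, cached_since=1), FileRecord("new", 30, cached_since=5)])
print(consider(FileRecord("incoming", 50), cache))
# -> (True, (FileRecord(name='old', size=60, cached_since=1),))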
self.storages = dict() self.remote_connection = RemoteStorage(Pipe(throughput=throughput)) + self.caching_algorithm = CacheAlgorithm( + caching_strategy=lambda file, storage: check_size(file, storage) + and check_relevance(file, storage), + deletion_strategy=lambda file, storage: delete_oldest_few_used( + file, storage + ), + ) def add_storage_element(self, storage_element: Storage): """ @@ -81,7 +94,18 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): ): try: potential_cache = random.choice(self.storages[dronesite]) - await potential_cache.apply_caching_decision(requested_file, job_repr) + cache_file, files_for_deletion = self.caching_algorithm.consider( + file=requested_file, storage=potential_cache + ) + if cache_file: + for file in files_for_deletion: + await potential_cache.remove(file, job_repr) + await potential_cache.add(requested_file, job_repr) + else: + print( + f"APPLY CACHING DECISION: Job {job_repr}, File {requested_file.filename}: File wasnt " + f"cached @ {time.now}" + ) except KeyError: pass print(f"now transfering {requested_file.filesize} from {used_connection}") diff --git a/lapis/storage.py b/lapis/storage.py index be50b10..567945f 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -3,7 +3,6 @@ from usim import time, Resources, Pipe, Scope from lapis.files import StoredFile, RequestedFile -from lapis.cachealgorithm import CacheAlgorithm class LookUpInformation(NamedTuple): @@ -30,7 +29,6 @@ class Storage(object): "_usedstorage", "files", "filenames", - "cachealgorithm", "connection", "remote_connection", ) @@ -52,7 +50,6 @@ def __init__( self._usedstorage = Resources( size=sum(file.filesize for file in files.values()) ) - self.cachealgorithm = CacheAlgorithm(self) self.connection = Pipe(throughput_limit) self.remote_connection = None @@ -134,34 +131,6 @@ async def transfer(self, file, job_repr): except AttributeError: pass - async def apply_caching_decision(self, requested_file: RequestedFile, job_repr): - """ - Applies the storage objects caching algorithm to the requested_file and - initiates resulting changes like placement and deletion of files - :param requested_file: - :param job_repr: Needed for debug output, will be replaced - :return: - """ - - print( - "APPLY CACHING DECISION: Job {}, File {} @ {}".format( - job_repr, requested_file.filename, time.now - ) - ) - to_be_removed = self.cachealgorithm.consider(requested_file) - if not to_be_removed: - await self.add(requested_file, job_repr) - elif to_be_removed == {requested_file}: - # file will not be cached because it either does not match - # conditions or because there is no space in the cache - print( - "APPLY CACHING DECISION: Job {}, File {}: File wasnt " - "cached @ {}".format(job_repr, requested_file.filename, time.now) - ) - else: - for file in to_be_removed: - await self.remove(file, job_repr) - def look_up_file(self, requested_file: RequestedFile, job_repr): """ Searches storage object for the requested_file and sends result (amount of @@ -220,3 +189,9 @@ async def transfer(self, file, job_repr): def look_up_file(self, requested_file: RequestedFile, job_repr): return LookUpInformation(requested_file.filesize) + + async def add(self, file: RequestedFile, job_repr): + pass + + async def remove(self, file: StoredFile, job_repr): + pass From 908bae101e387ec4e97030dcfc25811058c0e062 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 29 Nov 2019 07:13:16 +0100 Subject: [PATCH 443/648] fixed bug leading to full RAM again --- lapis/storage_io/storage.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 8e17601..cbd43d0 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -21,7 +21,7 @@ def storage_reader( name=row["name"], sitename=row["sitename"], size=int( - row["cachesizeGB"] * unit_conversion_mapping.get("cachesizeGB", 1) + int(row["cachesizeGB"]) * unit_conversion_mapping.get("cachesizeGB", 1) ), throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], From 76b868e99bd90fdf10d6b96264949d37c64d5021 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 29 Nov 2019 07:13:16 +0100 Subject: [PATCH 444/648] fixed bug leading to full RAM again --- lapis/storage_io/storage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index cbd43d0..f09a9da 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -21,7 +21,8 @@ def storage_reader( name=row["name"], sitename=row["sitename"], size=int( - int(row["cachesizeGB"]) * unit_conversion_mapping.get("cachesizeGB", 1) + float(row["cachesizeGB"]) + * unit_conversion_mapping.get("cachesizeGB", 1) ), throughput_limit=int(row["throughput_limit"]), files=storage_content[row["name"]], From 00fc4d9e62e6824ded8cb2c8ca79edf8d68011ee Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 29 Nov 2019 09:19:46 +0100 Subject: [PATCH 445/648] fix hit rate based caching functionality --- lapis/connection.py | 7 ++----- lapis/storage.py | 10 ++++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 6a28d87..fecaed3 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -63,9 +63,7 @@ async def _determine_inputfile_source( for storage in provided_storages: look_up_list.append(storage.look_up_file(requested_file, job_repr)) storage_list = sorted( - [entry async for entry in look_up_list], - key=lambda x: x[0], - reverse=True, + [entry for entry in look_up_list], key=lambda x: x[0], reverse=True ) for entry in storage_list: # TODO: check should better check that size is bigger than requested @@ -87,7 +85,6 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): used_connection = await self._determine_inputfile_source( requested_file, dronesite, job_repr ) - await sampling_required.put(used_connection) if used_connection == self.remote_connection and self.storages.get( dronesite, None @@ -130,7 +127,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): async with Scope() as scope: for inputfilename, inputfilespecs in requested_files.items(): requested_file = RequestedFile( - inputfilename, inputfilespecs["filesize"] + inputfilename, inputfilespecs["usedsize"] ) scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) stream_time = time.now - start_time diff --git a/lapis/storage.py b/lapis/storage.py index 567945f..3155980 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -14,8 +14,8 @@ class RemoteStorage(object): def __init__(self, pipe: Pipe): self._connection = pipe - async def transfer(self, file, job_repr): - await self._connection.transfer(total=file.filesize) + async def transfer(self, total, job_repr): + await self._connection.transfer(total=total) class Storage(object): @@ -184,11 +184,13 @@ async def transfer(self, file, job_repr): async with Scope() as scope: scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) scope.do( - 
self.remote_connection.transfer(total=1 - self._hitrate * file.filesize) + self.remote_connection.transfer( + total=(1 - self._hitrate) * file.filesize, job_repr=job_repr + ) ) def look_up_file(self, requested_file: RequestedFile, job_repr): - return LookUpInformation(requested_file.filesize) + return LookUpInformation(requested_file.filesize, self) async def add(self, file: RequestedFile, job_repr): pass From 8cb8c755e9001789ba0b661aa7fc172728183418 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 09:41:17 +0100 Subject: [PATCH 446/648] added first test for storage io --- lapis/storage_io/storage.py | 3 +-- lapis_tests/storage_io/__init__.py | 0 lapis_tests/storage_io/test_storage.py | 22 ++++++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 lapis_tests/storage_io/__init__.py create mode 100644 lapis_tests/storage_io/test_storage.py diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index f09a9da..e9a528f 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -41,8 +41,7 @@ def storage_content_reader( for row in reader: for key in row: if key not in ["filename", "cachename"]: - row[key] = int(row[key]) - row[key] = row[key] * unit_conversion_mapping.get(key, 1) + row[key] = int(float(row[key]) * unit_conversion_mapping.get(key, 1)) cache_information.setdefault(row["cachename"], {})[ row["filename"] ] = StoredFile(**row) diff --git a/lapis_tests/storage_io/__init__.py b/lapis_tests/storage_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py new file mode 100644 index 0000000..8480c23 --- /dev/null +++ b/lapis_tests/storage_io/test_storage.py @@ -0,0 +1,22 @@ +from tempfile import NamedTemporaryFile + +from lapis.storage import Storage +from lapis.storage_io.storage import storage_reader + + +class TestStorageReader(object): + def _create_simple_config(self): + storage_config = NamedTemporaryFile(suffix=".csv") + with open(storage_config.name, "w") as write_stream: + write_stream.write( + "name sitename cachesizeGB throughput_limit\n" "name sitename 10 10" + ) + return storage_config + + def test_empty_files(self): + simple_config = self._create_simple_config() + count = 0 + for storage in storage_reader(open(simple_config.name, "r"), None, Storage): + assert storage is not None + count += 1 + assert count == 1 From 224fda42fc23834ec78126d01b4f540724ef5ea5 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 11:11:35 +0100 Subject: [PATCH 447/648] adapted access to connection for RemoteStorage --- lapis/storage.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 3155980..59faf54 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -12,10 +12,10 @@ class LookUpInformation(NamedTuple): class RemoteStorage(object): def __init__(self, pipe: Pipe): - self._connection = pipe + self.connection = pipe async def transfer(self, total, job_repr): - await self._connection.transfer(total=total) + await self.connection.transfer(total=total) class Storage(object): @@ -30,7 +30,7 @@ class Storage(object): "files", "filenames", "connection", - "remote_connection", + "remote_storage", ) def __init__( @@ -51,7 +51,7 @@ def __init__( size=sum(file.filesize for file in files.values()) ) self.connection = Pipe(throughput_limit) - self.remote_connection = None + self.remote_storage = None @property def usedstorage(self): @@ -184,7 
+184,7 @@ async def transfer(self, file, job_repr): async with Scope() as scope: scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) scope.do( - self.remote_connection.transfer( + self.remote_storage.connection.transfer( total=(1 - self._hitrate) * file.filesize, job_repr=job_repr ) ) From 98856f7c3c6014c4369b192cdd014b0b976eee07 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 11:16:50 +0100 Subject: [PATCH 448/648] added new test for storage input --- lapis_tests/storage_io/test_storage.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 8480c23..62f9bea 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -9,10 +9,19 @@ def _create_simple_config(self): storage_config = NamedTemporaryFile(suffix=".csv") with open(storage_config.name, "w") as write_stream: write_stream.write( - "name sitename cachesizeGB throughput_limit\n" "name sitename 10 10" + "name sitename cachesizeGB throughput_limit\n" "name sitename 10.1 1" ) return storage_config + def _create_simple_files(self): + file_config = NamedTemporaryFile(suffix=".csv") + with open(file_config.name, "w") as write_stream: + write_stream.write( + "filename cachename filesize storedsize cachedsince lastaccessed numberofaccesses\n" + "file name 10.1 5.0 0 0 1" + ) + return file_config + def test_empty_files(self): simple_config = self._create_simple_config() count = 0 @@ -20,3 +29,18 @@ def test_empty_files(self): assert storage is not None count += 1 assert count == 1 + + def test_simple_read(self): + simple_config = self._create_simple_config() + simple_files = self._create_simple_files() + count = 0 + for storage in storage_reader( + open(simple_config.name, "r"), open(simple_files.name, "r"), Storage + ): + assert storage is not None + assert type(storage.free_space) == int + assert storage.free_space == int(5.1 * 1024 * 1024 * 1024) + assert type(storage.size) == int + assert storage.size == int(10.1 * 1024 * 1024 * 1024) + count += 1 + assert count == 1 From 421c8476977a6d202894a57d9468077053d926f4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 11:17:57 +0100 Subject: [PATCH 449/648] fixed assignment of remote storage --- lapis/connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/connection.py b/lapis/connection.py index fecaed3..ee75157 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -36,7 +36,7 @@ def add_storage_element(self, storage_element: Storage): :param storage_element: :return: """ - storage_element.remote_connection = self.remote_connection + storage_element.remote_storage = self.remote_connection try: self.storages[storage_element.sitename].append(storage_element) except KeyError: From 9510efebbed45bb256525fb02ee66d5ea921c66b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 11:30:14 +0100 Subject: [PATCH 450/648] reverted change of transfer signature and added typehints --- lapis/storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis/storage.py b/lapis/storage.py index 59faf54..4a5c816 100644 --- a/lapis/storage.py +++ b/lapis/storage.py @@ -14,8 +14,8 @@ class RemoteStorage(object): def __init__(self, pipe: Pipe): self.connection = pipe - async def transfer(self, total, job_repr): - await self.connection.transfer(total=total) + async def transfer(self, file: RequestedFile, job_repr): + await 
self.connection.transfer(total=file.filesize) class Storage(object): @@ -115,7 +115,7 @@ async def update_file(self, stored_file: StoredFile, job_repr): ) ) - async def transfer(self, file, job_repr): + async def transfer(self, file: RequestedFile, job_repr): """ Manages file transfer via the storage elements connection and updates file information. If the file should have been deleted since it was originally From c315c2f50b8553ce9f0ee49614e979abb92b56f6 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 12:06:53 +0100 Subject: [PATCH 451/648] introduced interface for storage --- lapis/cachealgorithm.py | 20 +++---- lapis/cli/simulate.py | 4 +- lapis/connection.py | 8 +-- lapis/interfaces/_storage.py | 47 ++++++++++++++++ lapis/monitor/caching.py | 12 ++-- lapis/{storage.py => storageelement.py} | 75 ++++++++++++++++++------- lapis_tests/storage_io/test_storage.py | 12 ++-- 7 files changed, 131 insertions(+), 47 deletions(-) create mode 100644 lapis/interfaces/_storage.py rename lapis/{storage.py => storageelement.py} (80%) diff --git a/lapis/cachealgorithm.py b/lapis/cachealgorithm.py index 2e58758..caa0d66 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/cachealgorithm.py @@ -1,24 +1,24 @@ from typing import Optional, Callable, Tuple from lapis.files import RequestedFile, StoredFile -from lapis.storage import Storage +from lapis.storageelement import StorageElement from lapis.utilities.cache_cleanup_implementations import sort_files_by_cachedsince -def check_size(file: RequestedFile, storage: Storage): +def check_size(file: RequestedFile, storage: StorageElement): return storage.size >= file.filesize -def check_relevance(file: RequestedFile, storage: Storage): +def check_relevance(file: RequestedFile, storage: StorageElement): return True def delete_oldest( - file: RequestedFile, storage: Storage + file: RequestedFile, storage: StorageElement ) -> Tuple[bool, Tuple[StoredFile]]: deletable_files = [] - currently_free = storage.free_space - if currently_free < storage.free_space: + currently_free = storage.available + if currently_free < storage.available: sorted_files = sort_files_by_cachedsince(storage.files.items()) while currently_free < file.filesize: deletable_files.append(next(sorted_files)) @@ -27,11 +27,11 @@ def delete_oldest( def delete_oldest_few_used( - file: RequestedFile, storage: Storage + file: RequestedFile, storage: StorageElement ) -> Tuple[bool, Optional[Tuple[StoredFile]]]: deletable_files = [] - currently_free = storage.free_space - if currently_free < storage.free_space: + currently_free = storage.available + if currently_free < storage.available: sorted_files = sort_files_by_cachedsince(storage.files.items()) for current_file in sorted_files: if current_file.numberofaccesses < 3: @@ -50,7 +50,7 @@ def __init__(self, caching_strategy: Callable, deletion_strategy: Callable): self._deletion_strategy = lambda file, storage: delete_oldest(file, storage) def consider( - self, file: RequestedFile, storage: Storage + self, file: RequestedFile, storage: StorageElement ) -> Tuple[bool, Optional[Tuple[StoredFile]]]: if self._caching_strategy(file, storage): return self._deletion_strategy(file, storage) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index d2a1701..d5e6cf6 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -11,7 +11,7 @@ from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader -from lapis.storage import Storage, HitrateStorage +from 
lapis.storageelement import StorageElement, HitrateStorage from lapis.storage_io.storage import storage_reader from lapis.scheduler import CondorJobScheduler @@ -114,7 +114,7 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit storage_reader=storage_import_mapper[storage_type], storage_type=partial(HitrateStorage, cache_hitrate) if cache_hitrate is not None - else Storage, + else StorageElement, ) for current_pool in pool_file: pool_file, pool_file_type = current_pool diff --git a/lapis/connection.py b/lapis/connection.py index ee75157..72432d5 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -9,7 +9,7 @@ check_relevance, delete_oldest_few_used, ) -from lapis.storage import Storage, RemoteStorage +from lapis.storageelement import StorageElement, RemoteStorage from lapis.files import RequestedFile from lapis.monitor import sampling_required @@ -29,7 +29,7 @@ def __init__(self, throughput=100): ), ) - def add_storage_element(self, storage_element: Storage): + def add_storage_element(self, storage_element: StorageElement): """ Register storage element in Connetion module clustering storage elements by sitename @@ -44,7 +44,7 @@ def add_storage_element(self, storage_element: Storage): async def _determine_inputfile_source( self, requested_file: RequestedFile, dronesite: str, job_repr: str - ) -> Union[Storage, RemoteStorage]: + ) -> Union[StorageElement, RemoteStorage]: """ Collects NamedTuples containing the amount of data of the requested file cached in a storage element and the storage element for all reachable storage @@ -61,7 +61,7 @@ async def _determine_inputfile_source( if provided_storages is not None: look_up_list = [] for storage in provided_storages: - look_up_list.append(storage.look_up_file(requested_file, job_repr)) + look_up_list.append(storage.find(requested_file, job_repr)) storage_list = sorted( [entry for entry in look_up_list], key=lambda x: x[0], reverse=True ) diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py new file mode 100644 index 0000000..f092f6d --- /dev/null +++ b/lapis/interfaces/_storage.py @@ -0,0 +1,47 @@ +import abc + +from typing import NamedTuple + +from lapis.files import RequestedFile, StoredFile + + +class LookUpInformation(NamedTuple): + cached_filesize: int + storage: "Storage" + + +class Storage(metaclass=abc.ABCMeta): + @property + @abc.abstractmethod + def size(self): + raise NotImplementedError + + @property + @abc.abstractmethod + def available(self): + raise NotImplementedError + + @property + @abc.abstractmethod + def used(self): + raise NotImplementedError + + @abc.abstractmethod + async def transfer(self, file: RequestedFile, job_repr): + raise NotImplementedError + + @abc.abstractmethod + async def add(self, file: RequestedFile, job_repr): + raise NotImplementedError + + @abc.abstractmethod + async def remove(self, file: StoredFile, job_repr): + raise NotImplementedError + + @abc.abstractmethod + async def update(self, file: StoredFile, job_repr): + raise NotImplementedError + + @abc.abstractmethod + def find(self, file: RequestedFile, job_repr) -> LookUpInformation: + raise NotImplementedError diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 604c748..22ca822 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -5,10 +5,10 @@ from usim import Pipe from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler -from lapis.storage import Storage +from lapis.storageelement import StorageElement -def storage_status(storage: 
Storage) -> list: +def storage_status(storage: StorageElement) -> list: """ Log information about current storage object state :param storage: @@ -17,7 +17,7 @@ def storage_status(storage: Storage) -> list: results = [ { "storage": repr(storage), - "usedstorage": storage.usedstorage, + "usedstorage": storage.used, "storagesize": storage.size, "numberoffiles": len(storage.files), } @@ -26,7 +26,7 @@ def storage_status(storage: Storage) -> list: storage_status.name = "storage_status" -storage_status.whitelist = (Storage,) +storage_status.whitelist = (StorageElement,) storage_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), @@ -36,7 +36,7 @@ def storage_status(storage: Storage) -> list: } -def storage_connection(storage: Storage) -> list: +def storage_connection(storage: StorageElement) -> list: """ Log information about the storages connection :param storage: @@ -54,7 +54,7 @@ def storage_connection(storage: Storage) -> list: storage_connection.name = "storage_connection" -storage_connection.whitelist = (Storage,) +storage_connection.whitelist = (StorageElement,) storage_connection.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), diff --git a/lapis/storage.py b/lapis/storageelement.py similarity index 80% rename from lapis/storage.py rename to lapis/storageelement.py index 4a5c816..1640068 100644 --- a/lapis/storage.py +++ b/lapis/storageelement.py @@ -1,29 +1,49 @@ -from typing import Optional, NamedTuple +from typing import Optional from usim import time, Resources, Pipe, Scope from lapis.files import StoredFile, RequestedFile +from lapis.interfaces._storage import Storage, LookUpInformation -class LookUpInformation(NamedTuple): - cached_filesize: int - storage: "Storage" - - -class RemoteStorage(object): +class RemoteStorage(Storage): def __init__(self, pipe: Pipe): self.connection = pipe - async def transfer(self, file: RequestedFile, job_repr): + @property + def size(self): + return float("Inf") + + @property + def available(self): + return float("Inf") + + @property + def used(self): + return 0 + + async def transfer(self, file: RequestedFile, **kwargs): await self.connection.transfer(total=file.filesize) + async def add(self, file: StoredFile, **kwargs): + raise NotImplementedError + + async def remove(self, file: StoredFile, **kwargs): + raise NotImplementedError + + async def update(self, file: StoredFile, **kwargs): + raise NotImplementedError -class Storage(object): + def find(self, file: RequestedFile, **kwargs) -> LookUpInformation: + raise NotImplementedError + + +class StorageElement(Storage): __slots__ = ( "name", "sitename", - "size", + "_size", "deletion_duration", "update_duration", "_usedstorage", @@ -45,7 +65,7 @@ def __init__( self.sitename = sitename self.deletion_duration = 5 self.update_duration = 1 - self.size = size + self._size = size self.files = files self._usedstorage = Resources( size=sum(file.filesize for file in files.values()) @@ -54,12 +74,16 @@ def __init__( self.remote_storage = None @property - def usedstorage(self): + def size(self): + return self._size + + @property + def used(self): return self._usedstorage.levels.size @property - def free_space(self): - return self.size - self.usedstorage + def available(self): + return self.size - self.used async def remove(self, file: StoredFile, job_repr): """ @@ -99,7 +123,7 @@ async def add(self, file: RequestedFile, job_repr): self.files[file.filename] = file await 
self.connection.transfer(file.filesize) - async def update_file(self, stored_file: StoredFile, job_repr): + async def update(self, stored_file: StoredFile, job_repr): """ Updates a stored files information upon access. :param stored_file: @@ -127,11 +151,11 @@ async def transfer(self, file: RequestedFile, job_repr): await self.connection.transfer(file.filesize) try: # TODO: needs handling of KeyError - await self.update_file(self.files[file.filename], job_repr) + await self.update(self.files[file.filename], job_repr) except AttributeError: pass - def look_up_file(self, requested_file: RequestedFile, job_repr): + def find(self, requested_file: RequestedFile, job_repr): """ Searches storage object for the requested_file and sends result (amount of cached data, storage object) to the queue. @@ -161,7 +185,7 @@ def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self.name or id(self)) -class HitrateStorage(Storage): +class HitrateStorage(StorageElement): def __init__( self, hitrate, @@ -180,7 +204,15 @@ def __init__( ) self._hitrate = hitrate - async def transfer(self, file, job_repr): + @property + def available(self): + return self.size + + @property + def used(self): + return 0 + + async def transfer(self, file: RequestedFile, job_repr): async with Scope() as scope: scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) scope.do( @@ -189,7 +221,7 @@ async def transfer(self, file, job_repr): ) ) - def look_up_file(self, requested_file: RequestedFile, job_repr): + def find(self, requested_file: RequestedFile, job_repr): return LookUpInformation(requested_file.filesize, self) async def add(self, file: RequestedFile, job_repr): @@ -197,3 +229,6 @@ async def add(self, file: RequestedFile, job_repr): async def remove(self, file: StoredFile, job_repr): pass + + async def update(self, stored_file: StoredFile, job_repr): + pass diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 62f9bea..758db4d 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -1,6 +1,6 @@ from tempfile import NamedTemporaryFile -from lapis.storage import Storage +from lapis.storageelement import StorageElement from lapis.storage_io.storage import storage_reader @@ -25,7 +25,9 @@ def _create_simple_files(self): def test_empty_files(self): simple_config = self._create_simple_config() count = 0 - for storage in storage_reader(open(simple_config.name, "r"), None, Storage): + for storage in storage_reader( + open(simple_config.name, "r"), None, StorageElement + ): assert storage is not None count += 1 assert count == 1 @@ -35,11 +37,11 @@ def test_simple_read(self): simple_files = self._create_simple_files() count = 0 for storage in storage_reader( - open(simple_config.name, "r"), open(simple_files.name, "r"), Storage + open(simple_config.name, "r"), open(simple_files.name, "r"), StorageElement ): assert storage is not None - assert type(storage.free_space) == int - assert storage.free_space == int(5.1 * 1024 * 1024 * 1024) + assert type(storage.available) == int + assert storage.available == int(5.1 * 1024 * 1024 * 1024) assert type(storage.size) == int assert storage.size == int(10.1 * 1024 * 1024 * 1024) count += 1 From f57a4cdf687a84726201f349ef4b27d21151c6ce Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 12:15:51 +0100 Subject: [PATCH 452/648] added docstrings to storage interface --- lapis/interfaces/_storage.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 
deletions(-) diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index f092f6d..ae37093 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -13,35 +13,58 @@ class LookUpInformation(NamedTuple): class Storage(metaclass=abc.ABCMeta): @property @abc.abstractmethod - def size(self): + def size(self) -> int: + """Total size of storage in Bytes""" raise NotImplementedError @property @abc.abstractmethod - def available(self): + def available(self) -> int: + """Available storage in Bytes""" raise NotImplementedError @property @abc.abstractmethod - def used(self): + def used(self) -> int: + """Used storage in Bytes""" raise NotImplementedError @abc.abstractmethod async def transfer(self, file: RequestedFile, job_repr): + """ + Transfer size of given file via the storages' connection and update file + information. If the file was deleted since it was originally looked up + the resulting error is not raised. + + .. TODO:: What does this mean with the error? + """ raise NotImplementedError @abc.abstractmethod async def add(self, file: RequestedFile, job_repr): + """ + Add file information to storage and transfer the size of the file via + the storages' connection. + """ raise NotImplementedError @abc.abstractmethod async def remove(self, file: StoredFile, job_repr): + """ + Remove all file information and used filesize from the storage. + """ raise NotImplementedError @abc.abstractmethod async def update(self, file: StoredFile, job_repr): + """ + Updates a stored files information upon access. + + .. TODO:: This should be included in an operation to access/transfer. + """ raise NotImplementedError @abc.abstractmethod def find(self, file: RequestedFile, job_repr) -> LookUpInformation: + """Information if a file is stored in Storage""" raise NotImplementedError From 05a854c3d95a8b279ee8f7960285bc17451c2426 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 12:30:50 +0100 Subject: [PATCH 453/648] extended tests --- lapis_tests/storage_io/test_storage.py | 41 +++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 758db4d..7cdbe17 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -5,20 +5,21 @@ class TestStorageReader(object): - def _create_simple_config(self): + def _create_simple_config(self, to_string=False): storage_config = NamedTemporaryFile(suffix=".csv") with open(storage_config.name, "w") as write_stream: write_stream.write( - "name sitename cachesizeGB throughput_limit\n" "name sitename 10.1 1" + f"name sitename cachesizeGB throughput_limit\n" + f"name sitename {str(10) if to_string else 10} {str(10.1) if to_string else 10.1} {str(1) if to_string else 1}" ) return storage_config - def _create_simple_files(self): + def _create_simple_files(self, to_string=False): file_config = NamedTemporaryFile(suffix=".csv") with open(file_config.name, "w") as write_stream: write_stream.write( - "filename cachename filesize storedsize cachedsince lastaccessed numberofaccesses\n" - "file name 10.1 5.0 0 0 1" + f"filename cachename filesize storedsize cachedsince lastaccessed numberofaccesses\n" + f"file name {str(10.1) if to_string else 10.1} {str(5.0) if to_string else 5.0} {str(0) if to_string else 0} {str(0) if to_string else 0} {str(1) if to_string else 1}" ) return file_config @@ -33,16 +34,20 @@ def test_empty_files(self): assert count == 1 def test_simple_read(self): - 
simple_config = self._create_simple_config() - simple_files = self._create_simple_files() - count = 0 - for storage in storage_reader( - open(simple_config.name, "r"), open(simple_files.name, "r"), StorageElement - ): - assert storage is not None - assert type(storage.available) == int - assert storage.available == int(5.1 * 1024 * 1024 * 1024) - assert type(storage.size) == int - assert storage.size == int(10.1 * 1024 * 1024 * 1024) - count += 1 - assert count == 1 + for variant in [False, True]: + print(f"starting with {variant}") + simple_config = self._create_simple_config(to_string=variant) + simple_files = self._create_simple_files(to_string=variant) + count = 0 + for storage in storage_reader( + open(simple_config.name, "r"), + open(simple_files.name, "r"), + StorageElement, + ): + assert storage is not None + assert type(storage.available) == int + assert storage.available == int(5.1 * 1024 * 1024 * 1024) + assert type(storage.size) == int + assert storage.size == int(10.1 * 1024 * 1024 * 1024) + count += 1 + assert count == 1 From 448d69164533531b4777bb91455a7be2edc557d7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 29 Nov 2019 12:37:20 +0100 Subject: [PATCH 454/648] removed public update method from storage and made update private --- lapis/interfaces/_storage.py | 9 --------- lapis/storageelement.py | 10 ++-------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index ae37093..78f7dfc 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -55,15 +55,6 @@ async def remove(self, file: StoredFile, job_repr): """ raise NotImplementedError - @abc.abstractmethod - async def update(self, file: StoredFile, job_repr): - """ - Updates a stored files information upon access. - - .. TODO:: This should be included in an operation to access/transfer. - """ - raise NotImplementedError - @abc.abstractmethod def find(self, file: RequestedFile, job_repr) -> LookUpInformation: """Information if a file is stored in Storage""" diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 1640068..8a27e91 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -31,9 +31,6 @@ async def add(self, file: StoredFile, **kwargs): async def remove(self, file: StoredFile, **kwargs): raise NotImplementedError - async def update(self, file: StoredFile, **kwargs): - raise NotImplementedError - def find(self, file: RequestedFile, **kwargs) -> LookUpInformation: raise NotImplementedError @@ -123,7 +120,7 @@ async def add(self, file: RequestedFile, job_repr): self.files[file.filename] = file await self.connection.transfer(file.filesize) - async def update(self, stored_file: StoredFile, job_repr): + async def _update(self, stored_file: StoredFile, job_repr): """ Updates a stored files information upon access. 
:param stored_file: @@ -151,7 +148,7 @@ async def transfer(self, file: RequestedFile, job_repr): await self.connection.transfer(file.filesize) try: # TODO: needs handling of KeyError - await self.update(self.files[file.filename], job_repr) + await self._update(self.files[file.filename], job_repr) except AttributeError: pass @@ -229,6 +226,3 @@ async def add(self, file: RequestedFile, job_repr): async def remove(self, file: StoredFile, job_repr): pass - - async def update(self, stored_file: StoredFile, job_repr): - pass From 81ce9ac760c34952f0a3095d96a2e2b4a88de7ea Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 29 Nov 2019 17:32:12 +0100 Subject: [PATCH 455/648] added unit conversion for storageelement connection and remote connection throughput_limit, filesize, usedsize --- lapis/cli/simulate.py | 2 +- lapis/job_io/htcondor.py | 9 +++++++++ lapis/storage_io/storage.py | 5 ++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index d5e6cf6..619e55f 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -106,7 +106,7 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit simulator.create_scheduler(scheduler_type=CondorJobScheduler) if all(storage_files): - simulator.create_connection_module(remote_throughput) + simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) storage_file, storage_content_file, storage_type = storage_files simulator.create_storage( storage_input=storage_file, diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index c95183d..3173a39 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -31,6 +31,8 @@ def htcondor_job_reader( "RemoteWallClockTime": 1, "MemoryUsage": 1000 * 1000, "DiskUsage_RAW": 1024, + "filesize": 1024 * 1024 * 1024, + "usedsize": 1024 * 1024 * 1024, }, ): input_file_type = iterable.name.split(".")[-1].lower() @@ -78,8 +80,15 @@ def htcondor_job_reader( resources["inputfiles"] = deepcopy(entry["Inputfiles"]) used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for filename, filespecs in entry["Inputfiles"].items(): + for key in filespecs.keys(): + filespecs[key] = filespecs[key] * unit_conversion_mapping.get( + key, 1 + ) + + print(filespecs) if "usedsize" in filespecs: del resources["inputfiles"][filename]["usedsize"] + if "filesize" in filespecs: if "usedsize" not in filespecs: used_resources["inputfiles"][filename]["usedsize"] = filespecs[ diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index e9a528f..3529c28 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -24,7 +24,10 @@ def storage_reader( float(row["cachesizeGB"]) * unit_conversion_mapping.get("cachesizeGB", 1) ), - throughput_limit=int(row["throughput_limit"]), + throughput_limit=int( + float(row["throughput_limit"]) + * unit_conversion_mapping.get("throughput_limit", 1) + ), files=storage_content[row["name"]], )() From ad6eed42d39bb71ecd612101cc806627db5b42de Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 29 Nov 2019 20:39:38 +0100 Subject: [PATCH 456/648] reformated debug output --- lapis/connection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/connection.py b/lapis/connection.py index 72432d5..69aec51 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -100,7 +100,8 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): await potential_cache.add(requested_file, job_repr) else: print( - f"APPLY 
CACHING DECISION: Job {job_repr}, File {requested_file.filename}: File wasnt " + f"APPLY CACHING DECISION: Job {job_repr}, " + f"File {requested_file.filename}: File wasnt " f"cached @ {time.now}" ) except KeyError: From 1de14bca62e5f6d9406dfd6c9ffebf653876b6c2 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 29 Nov 2019 22:05:17 +0100 Subject: [PATCH 457/648] added debug output and fixed wrong function call --- lapis/storageelement.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 8a27e91..602f9cd 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -210,11 +210,19 @@ def used(self): return 0 async def transfer(self, file: RequestedFile, job_repr): + print( + "TRANSFER: {}, filesize {}, remote: {}, cache: {}".format( + self._hitrate, + file.filesize, + (1 - self._hitrate) * file.filesize, + self._hitrate * file.filesize, + ) + ) async with Scope() as scope: scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) scope.do( self.remote_storage.connection.transfer( - total=(1 - self._hitrate) * file.filesize, job_repr=job_repr + total=(1 - self._hitrate) * file.filesize ) ) From aa8e68d5951f5a6affdc9ff7a2cd243a6e146919 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 09:27:10 +0100 Subject: [PATCH 458/648] reduced debug output, added default value for job_repr --- lapis/connection.py | 10 ++++++---- lapis/scheduler.py | 9 ++++++++- lapis/storageelement.py | 16 ++++++++-------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 69aec51..75ac6b6 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -43,7 +43,7 @@ def add_storage_element(self, storage_element: StorageElement): self.storages[storage_element.sitename] = [storage_element] async def _determine_inputfile_source( - self, requested_file: RequestedFile, dronesite: str, job_repr: str + self, requested_file: RequestedFile, dronesite: str, job_repr: str = None ) -> Union[StorageElement, RemoteStorage]: """ Collects NamedTuples containing the amount of data of the requested file @@ -71,7 +71,9 @@ async def _determine_inputfile_source( return entry.storage return self.remote_connection - async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): + async def stream_file( + self, requested_file: RequestedFile, dronesite, job_repr=None + ): """ Determines which storage object is used to provide the requested file and startes the files transfer. 
For files transfered via remote connection a @@ -114,7 +116,7 @@ async def stream_file(self, requested_file: RequestedFile, dronesite, job_repr): ) ) - async def transfer_files(self, drone, requested_files: dict, job_repr): + async def transfer_files(self, drone, requested_files: dict, job_repr=None): """ Converts dict information about requested files to RequestedFile object and parallely launches streaming for all files @@ -123,7 +125,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): :param job_repr: :return: """ - print("registered caches", self.storages) + start_time = time.now async with Scope() as scope: for inputfilename, inputfilespecs in requested_files.items(): diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 0b4c33d..7c25e6a 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -89,7 +89,9 @@ async def run(self): scope.do(self._collect_jobs()) async for _ in interval(self.interval): print("NEW SCHEDULING INTERVAL @ {}".format(time.now)) - for job in self.job_queue: + print(self.job_queue) + for job in self.job_queue.copy(): + print("SCHEDULING {}".format(repr(job))) best_match = self._schedule_job(job) if best_match: print( @@ -135,6 +137,11 @@ def _schedule_job(self, job) -> Drone: drone = cluster[0] cost = 0 resources = drone.theoretical_available_resources + # print( + # "trying to match Job {} to {}, resources {}".format( + # repr(job), repr(drone), resources + # ) + # ) for resource_type in job.resources: if resources.get(resource_type, 0) < job.resources[resource_type]: # Inf for all job resources that a drone does not support diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 602f9cd..2a68d98 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -82,7 +82,7 @@ def used(self): def available(self): return self.size - self.used - async def remove(self, file: StoredFile, job_repr): + async def remove(self, file: StoredFile, job_repr=None): """ Deletes file from storage object. The time this operation takes is defined by the storages deletion_duration attribute. @@ -99,7 +99,7 @@ async def remove(self, file: StoredFile, job_repr): await self._usedstorage.decrease(usedsize=file.filesize) self.files.pop(file.filename) - async def add(self, file: RequestedFile, job_repr): + async def add(self, file: RequestedFile, job_repr=None): """ Adds file to storage object transfering it through the storage objects connection. This should be sufficient for now because files are only added @@ -136,7 +136,7 @@ async def _update(self, stored_file: StoredFile, job_repr): ) ) - async def transfer(self, file: RequestedFile, job_repr): + async def transfer(self, file: RequestedFile, job_repr=None): """ Manages file transfer via the storage elements connection and updates file information. If the file should have been deleted since it was originally @@ -152,7 +152,7 @@ async def transfer(self, file: RequestedFile, job_repr): except AttributeError: pass - def find(self, requested_file: RequestedFile, job_repr): + def find(self, requested_file: RequestedFile, job_repr=None): """ Searches storage object for the requested_file and sends result (amount of cached data, storage object) to the queue. 
@@ -209,7 +209,7 @@ def available(self): def used(self): return 0 - async def transfer(self, file: RequestedFile, job_repr): + async def transfer(self, file: RequestedFile, job_repr=None): print( "TRANSFER: {}, filesize {}, remote: {}, cache: {}".format( self._hitrate, @@ -226,11 +226,11 @@ async def transfer(self, file: RequestedFile, job_repr): ) ) - def find(self, requested_file: RequestedFile, job_repr): + def find(self, requested_file: RequestedFile, job_repr=None): return LookUpInformation(requested_file.filesize, self) - async def add(self, file: RequestedFile, job_repr): + async def add(self, file: RequestedFile, job_repr=None): pass - async def remove(self, file: StoredFile, job_repr): + async def remove(self, file: StoredFile, job_repr=None): pass From 314029287594ff2df9d13aadfa9f050a1b2b4c83 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 09:28:35 +0100 Subject: [PATCH 459/648] Added unittests to test functionality of Connection class with HitrateStorage storage elements --- lapis_tests/__init__.py | 11 ++- lapis_tests/test_caching_hitrate_based.py | 82 +++++++++++++++++++++++ 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 lapis_tests/test_caching_hitrate_based.py diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index f749a6b..562d4c0 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -1,10 +1,11 @@ -from typing import Callable, Coroutine +from typing import Callable, Coroutine, Optional from functools import wraps from usim import run, Resources from lapis.drone import Drone from lapis.job import Job +from lapis.connection import Connection class UnfinishedTest(RuntimeError): @@ -71,4 +72,10 @@ async def job_finished(self, job: Job): class DummyDrone: - connection = None + sitename = None + + def __init__(self, throughput: Optional[float] = None): + if throughput: + self.connection = Connection(throughput) + else: + self.connection = None diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py new file mode 100644 index 0000000..83d3d6c --- /dev/null +++ b/lapis_tests/test_caching_hitrate_based.py @@ -0,0 +1,82 @@ +from usim import time + +from lapis_tests import via_usim, DummyDrone +from lapis.connection import Connection +from lapis.storageelement import HitrateStorage +from lapis.files import RequestedFile + + +class TestHitrateCaching(object): + def test_hitratestorage(self): + size = 1000 + hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + requested_file = RequestedFile(filename="testfile", filesize=100) + looked_up_file = hitratestorage.find(requested_file, job_repr=None) + + assert size == hitratestorage.available + assert 0 == hitratestorage.used + assert 100 == looked_up_file.cached_filesize + assert hitratestorage == looked_up_file.storage + + @via_usim + async def test_add_storage_to_connection(self): + throughput = 10 + size = 1000 + hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + connection = Connection(throughput=throughput) + connection.add_storage_element(hitratestorage) + assert hitratestorage in connection.storages[hitratestorage.sitename] + + @via_usim + async def test_determine_inputfile_source(self): + throughput = 10 + size = 1000 + requested_file = RequestedFile(filename="testfile", filesize=100) + hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + connection = Connection(throughput=throughput) + connection.add_storage_element(hitratestorage) + cache = await 
connection._determine_inputfile_source( + requested_file=requested_file, dronesite=None + ) + assert cache is hitratestorage + + @via_usim + async def test_stream_file(self): + throughput = 10 + size = 1000 + requested_file = RequestedFile(filename="testfile", filesize=100) + hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + connection = Connection(throughput=throughput) + connection.add_storage_element(hitratestorage) + assert 0 == time.now + await connection.stream_file(requested_file=requested_file, dronesite=None) + assert 5 == time.now + + @via_usim + async def test_single_transfer_files(self): + throughput = 10 + size = 1000 + drone = DummyDrone(throughput) + requested_files = dict(test=dict(usedsize=100)) + hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + drone.connection.add_storage_element(hitratestorage) + stream_time = await drone.connection.transfer_files( + drone=drone, requested_files=requested_files, job_repr="test" + ) + + assert time.now == 5 + assert stream_time == 5 + + @via_usim + async def test_simultaneous_transfer(self): + throughput = 10 + size = 1000 + drone = DummyDrone(throughput) + requested_files = dict(test1=dict(usedsize=100), test2=dict(usedsize=200)) + hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + drone.connection.add_storage_element(hitratestorage) + stream_time = await drone.connection.transfer_files( + drone=drone, requested_files=requested_files + ) + assert time.now == 15 + assert stream_time == 15 From f2f1d4b843716a2da52e2e36dc1f6f361fbe85fd Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 10:51:05 +0100 Subject: [PATCH 460/648] extended type hints --- lapis/connection.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 75ac6b6..7559eda 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -1,6 +1,6 @@ import random -from typing import Union +from typing import Union, Optional from usim import Scope, time, Pipe from lapis.cachealgorithm import ( @@ -43,7 +43,10 @@ def add_storage_element(self, storage_element: StorageElement): self.storages[storage_element.sitename] = [storage_element] async def _determine_inputfile_source( - self, requested_file: RequestedFile, dronesite: str, job_repr: str = None + self, + requested_file: RequestedFile, + dronesite: Optional[str], + job_repr: Optional[str] = None, ) -> Union[StorageElement, RemoteStorage]: """ Collects NamedTuples containing the amount of data of the requested file From aea8a2106cc756043769697ecac2b898726b27cf Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 10:52:29 +0100 Subject: [PATCH 461/648] fixed error in _usedstorage calculation --- lapis/storageelement.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 2a68d98..fd0fe98 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -65,7 +65,7 @@ def __init__( self._size = size self.files = files self._usedstorage = Resources( - size=sum(file.filesize for file in files.values()) + size=sum(file.storedsize for file in files.values()) ) self.connection = Pipe(throughput_limit) self.remote_storage = None From 2c64bf443c04c2dbed3cc89f55c8e79585e90bb3 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 10:53:58 +0100 Subject: [PATCH 462/648] storage.py now catches exception caused by not specified storage content config --- lapis/storage_io/storage.py | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 3529c28..6728839 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -13,7 +13,10 @@ def storage_reader( "throughput_limit": 1024 * 1024 * 1024, }, ): - storage_content = storage_content_reader(storage_content) + try: + storage_content = storage_content_reader(storage_content) + except TypeError: + storage_content = dict() reader = csv.DictReader(storage, delimiter=" ", quotechar="'") for row in reader: yield partial( @@ -28,7 +31,7 @@ def storage_reader( float(row["throughput_limit"]) * unit_conversion_mapping.get("throughput_limit", 1) ), - files=storage_content[row["name"]], + files=storage_content.get(row["name"], dict()), )() @@ -36,7 +39,7 @@ def storage_content_reader( file_name, unit_conversion_mapping: dict = { # noqa: B006 "filesize": 1024 * 1024 * 1024, - "usedsize": 1024 * 1024 * 1024, + "storedsize": 1024 * 1024 * 1024, }, ): reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") From e22d3302da3a5b5cb6cb6f2ff42dccb6e994c59e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 10:57:26 +0100 Subject: [PATCH 463/648] fixed test_storage unit tests --- lapis_tests/storage_io/test_storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 7cdbe17..81fc0da 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -27,7 +27,7 @@ def test_empty_files(self): simple_config = self._create_simple_config() count = 0 for storage in storage_reader( - open(simple_config.name, "r"), None, StorageElement + open(simple_config.name, "r+"), None, StorageElement ): assert storage is not None count += 1 @@ -46,8 +46,8 @@ def test_simple_read(self): ): assert storage is not None assert type(storage.available) == int - assert storage.available == int(5.1 * 1024 * 1024 * 1024) + assert storage.available == int(5.0 * 1024 * 1024 * 1024) assert type(storage.size) == int - assert storage.size == int(10.1 * 1024 * 1024 * 1024) + assert storage.size == int(10.0 * 1024 * 1024 * 1024) count += 1 assert count == 1 From 86cd2b291295615bb3ec938b15221c5cabac11e1 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 11:01:08 +0100 Subject: [PATCH 464/648] added new unit tests for hitrate based caching --- lapis_tests/test_caching_hitrate_based.py | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 83d3d6c..7fd7fc0 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -1,9 +1,18 @@ from usim import time +from tempfile import NamedTemporaryFile +import json +from functools import partial from lapis_tests import via_usim, DummyDrone from lapis.connection import Connection from lapis.storageelement import HitrateStorage +from lapis.storage_io.storage import storage_reader from lapis.files import RequestedFile +from lapis.simulator import Simulator +from lapis.job_io.htcondor import htcondor_job_reader +from lapis.pool import StaticPool +from lapis.pool_io.htcondor import htcondor_pool_reader +from lapis.scheduler import CondorJobScheduler class TestHitrateCaching(object): @@ -80,3 +89,123 @@ async def test_simultaneous_transfer(self): ) assert time.now == 15 assert stream_time == 15 + + @via_usim + async def 
test_caching_simulation_duration_short_jobs(self): + simulator = Simulator() + with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( + suffix=".csv" + ) as storage_config, NamedTemporaryFile(suffix=".json") as job_config: + with open(machine_config.name, "w") as write_stream: + write_stream.write( + "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count sitename\n" + "1 44624348.0 8000 1 site1" + ) + with open(job_config.name, "w") as write_stream: + job_description = [ + { + "QDate": 0, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 1024, + "RequestDisk": 1024, + "RemoteWallClockTime": 1.0, + "MemoryUsage": 1024, + "DiskUsage_RAW": 1024, + "RemoteSysCpu": 1.0, + "RemoteUserCpu": 0.0, + "Inputfiles": dict( + file1=dict(usedsize=10), file2=dict(usedsize=5) + ), + } + ] * 2 + json.dump(job_description, write_stream) + with open(storage_config.name, "w") as write_stream: + write_stream.write( + "name sitename cachesizeGB throughput_limit\n" + "cache1 site1 1000 1.0" + ) + + job_input = open(job_config.name, "r+") + machine_input = open(machine_config.name, "r+") + storage_input = open(storage_config.name, "r+") + storage_content_input = None + cache_hitrate = 0.5 + simulator.create_job_generator( + job_input=job_input, job_reader=htcondor_job_reader + ) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.create_connection_module(remote_throughput=1.0) + simulator.create_pools( + pool_input=machine_input, + pool_reader=htcondor_pool_reader, + pool_type=StaticPool, + ) + simulator.create_storage( + storage_input=storage_input, + storage_content_input=storage_content_input, + storage_reader=storage_reader, + storage_type=partial(HitrateStorage, cache_hitrate), + ) + simulator.run() + assert 180 == simulator.duration + + @via_usim + async def test_caching_simulation_duration_long_jobs(self): + simulator = Simulator() + with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( + suffix=".csv" + ) as storage_config, NamedTemporaryFile(suffix=".json") as job_config: + with open(machine_config.name, "w") as write_stream: + write_stream.write( + "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count sitename\n" + "1 44624348.0 8000 1 site1" + ) + with open(job_config.name, "w") as write_stream: + job_description = [ + { + "QDate": 0, + "RequestCpus": 1, + "RequestWalltime": 60, + "RequestMemory": 1024, + "RequestDisk": 1024, + "RemoteWallClockTime": 1.0, + "MemoryUsage": 1024, + "DiskUsage_RAW": 1024, + "RemoteSysCpu": 1.0, + "RemoteUserCpu": 0.0, + "Inputfiles": dict( + file1=dict(usedsize=60), file2=dict(usedsize=60) + ), + } + ] * 2 + json.dump(job_description, write_stream) + with open(storage_config.name, "w") as write_stream: + write_stream.write( + "name sitename cachesizeGB throughput_limit\n" + "cache1 site1 1000 1.0" + ) + + job_input = open(job_config.name, "r+") + machine_input = open(machine_config.name, "r+") + storage_input = open(storage_config.name, "r+") + storage_content_input = None + cache_hitrate = 0.5 + simulator.create_job_generator( + job_input=job_input, job_reader=htcondor_job_reader + ) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + simulator.create_connection_module(remote_throughput=1.0) + simulator.create_pools( + pool_input=machine_input, + pool_reader=htcondor_pool_reader, + pool_type=StaticPool, + ) + simulator.create_storage( + storage_input=storage_input, + storage_content_input=storage_content_input, + storage_reader=storage_reader, + storage_type=partial(HitrateStorage, 
cache_hitrate), + ) + simulator.run() + assert 300 == simulator.duration From 651345384976665ac9853d63f6a9b74d6ac6ab9e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 13:12:01 +0100 Subject: [PATCH 465/648] removed forgotten debug output --- lapis/job_io/htcondor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 3173a39..f4caa6a 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -84,8 +84,6 @@ def htcondor_job_reader( filespecs[key] = filespecs[key] * unit_conversion_mapping.get( key, 1 ) - - print(filespecs) if "usedsize" in filespecs: del resources["inputfiles"][filename]["usedsize"] From bec195ae78ca3fc4ac135dc33a1f49f3d7ad6df3 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 13:13:15 +0100 Subject: [PATCH 466/648] storage_content_reader() handles empty files correctly now --- lapis/storage_io/storage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 6728839..526f8f1 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -51,6 +51,4 @@ def storage_content_reader( cache_information.setdefault(row["cachename"], {})[ row["filename"] ] = StoredFile(**row) - if not cache_information: - return None return cache_information From 1a51eacfee5b6761e4ecce9fbcee86dcb95f18ba Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 13:22:29 +0100 Subject: [PATCH 467/648] fixed line length --- lapis_tests/storage_io/test_storage.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 81fc0da..68adb46 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -10,7 +10,8 @@ def _create_simple_config(self, to_string=False): with open(storage_config.name, "w") as write_stream: write_stream.write( f"name sitename cachesizeGB throughput_limit\n" - f"name sitename {str(10) if to_string else 10} {str(10.1) if to_string else 10.1} {str(1) if to_string else 1}" + f"name sitename {str(10) if to_string else 10} " + f"{str(10.1) if to_string else 10.1} {str(1) if to_string else 1}" ) return storage_config @@ -18,8 +19,12 @@ def _create_simple_files(self, to_string=False): file_config = NamedTemporaryFile(suffix=".csv") with open(file_config.name, "w") as write_stream: write_stream.write( - f"filename cachename filesize storedsize cachedsince lastaccessed numberofaccesses\n" - f"file name {str(10.1) if to_string else 10.1} {str(5.0) if to_string else 5.0} {str(0) if to_string else 0} {str(0) if to_string else 0} {str(1) if to_string else 1}" + f"filename cachename filesize storedsize cachedsince lastaccessed " + f"numberofaccesses\n" + f"file name {str(10.1) if to_string else 10.1} " + f"{str(5.0) if to_string else 5.0} " + f"{str(0) if to_string else 0} {str(0) if to_string else 0} " + f"{str(1) if to_string else 1}" ) return file_config From 0919d4a292fdb2692ba07392774f9dc881f095d8 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 13:27:11 +0100 Subject: [PATCH 468/648] fixed line length --- lapis/connection.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 7559eda..41562e6 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -137,7 +137,9 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): ) 
scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) stream_time = time.now - start_time - print( - "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) - ) + + # print( + # "STREAMED files {} in {}".format(list(requested_files.keys()), + # stream_time) + # ) return stream_time From 26c143ba3b07ec0792a98ead0005ae6233c2def9 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 3 Dec 2019 14:33:42 +0100 Subject: [PATCH 469/648] Monitor.run now properly closes the sampling aiter --- lapis/monitor/__init__.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 9098cca..f9bf34a 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -45,12 +45,20 @@ def __init__(self): self._statistics = {} async def run(self): - async for log_object in sampling_required: - for statistic in self._statistics.get(type(log_object), set()): - # do the logging - for record in statistic(log_object): - record["tardis"] = "lapis-%s" % SIMULATION_START - logging.getLogger(statistic.name).info(statistic.name, record) + # The Queue.__aiter__ cannot safely be finalised unless closed. + # We explicitly create and later on aclose it, to ensure this happens + # when the Scope collects us and the event loop is still around. + log_iter = sampling_required.__aiter__() + try: + async for log_object in log_iter: + for statistic in self._statistics.get(type(log_object), set()): + # do the logging + for record in statistic(log_object): + record["tardis"] = "lapis-%s" % SIMULATION_START + logging.getLogger(statistic.name).info(statistic.name, record) + except GeneratorExit: + await log_iter.aclose() + def register_statistic(self, statistic: Callable) -> None: """ From cd5daba24ecc80d8931a53a88a3094e1369c21e7 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Tue, 3 Dec 2019 14:47:51 +0100 Subject: [PATCH 470/648] made the linter happy --- lapis/monitor/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index f9bf34a..47feb59 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -59,7 +59,6 @@ async def run(self): except GeneratorExit: await log_iter.aclose() - def register_statistic(self, statistic: Callable) -> None: """ Register a callable that takes an object for logging and generates a list From 86b8f4c24a2728312cb47774dd0f869c8b4ec450 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 15:01:39 +0100 Subject: [PATCH 471/648] added default value for calculation_efficiency --- lapis/cli/simulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 619e55f..293c646 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -38,7 +38,7 @@ @click.option("--log-tcp", "log_tcp", is_flag=True) @click.option("--log-file", "log_file", type=click.File("w")) @click.option("--log-telegraf", "log_telegraf", is_flag=True) -@click.option("--calculation-efficiency", type=float) +@click.option("--calculation-efficiency", type=float, default=0.9) @click.pass_context def cli(ctx, seed, until, log_tcp, log_file, log_telegraf, calculation_efficiency): ctx.ensure_object(dict) From 8cb78d0628dfbe7807d3706d25f2bb496ebe2fee Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 15:15:18 +0100 Subject: [PATCH 472/648] remove calculation_efficiency default value --- lapis/cli/simulate.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 293c646..619e55f 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -38,7 +38,7 @@ @click.option("--log-tcp", "log_tcp", is_flag=True) @click.option("--log-file", "log_file", type=click.File("w")) @click.option("--log-telegraf", "log_telegraf", is_flag=True) -@click.option("--calculation-efficiency", type=float, default=0.9) +@click.option("--calculation-efficiency", type=float) @click.pass_context def cli(ctx, seed, until, log_tcp, log_file, log_telegraf, calculation_efficiency): ctx.ensure_object(dict) From 56f30775afc3dcb40a7e66ddaa190ce00e85fae4 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 20:20:10 +0100 Subject: [PATCH 473/648] corrected wrong attribute names --- lapis/storageelement.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index fd0fe98..a99d4c2 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -96,7 +96,7 @@ async def remove(self, file: StoredFile, job_repr=None): ) ) await (time + self.deletion_duration) - await self._usedstorage.decrease(usedsize=file.filesize) + await self._usedstorage.decrease(size=file.filesize) self.files.pop(file.filename) async def add(self, file: RequestedFile, job_repr=None): @@ -116,7 +116,7 @@ async def add(self, file: RequestedFile, job_repr=None): ) ) file = file.convert_to_stored_file_object(time.now) - await self._usedstorage.increase(usedsize=file.filesize) + await self._usedstorage.increase(size=file.filesize) self.files[file.filename] = file await self.connection.transfer(file.filesize) From ddb0932f721038b139c5ff3fe2f3b81717d61c70 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 20:21:20 +0100 Subject: [PATCH 474/648] fixed usage of transfer() interface and debug outputs --- lapis/connection.py | 10 ++++------ lapis/job.py | 5 ++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 41562e6..a8f59f4 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -112,7 +112,7 @@ async def stream_file( except KeyError: pass print(f"now transfering {requested_file.filesize} from {used_connection}") - await used_connection.transfer(requested_file, job_repr) + await used_connection.transfer(requested_file, job_repr=job_repr) print( "Job {}: finished transfering of file {}: {}GB @ {}".format( job_repr, requested_file.filename, requested_file.filesize, time.now @@ -137,9 +137,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): ) scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) stream_time = time.now - start_time - - # print( - # "STREAMED files {} in {}".format(list(requested_files.keys()), - # stream_time) - # ) + print( + "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) + ) return stream_time diff --git a/lapis/job.py b/lapis/job.py index fd15b93..e7cf698 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -103,7 +103,10 @@ async def _calculate(self): :param calculation_efficiency: :return: """ - print(f"WALLTIME: Job {self} @ {time.now}") + print( + f"WALLTIME: Job {self} @ {time.now}, {self.used_resources['cores']}, " + f"{self.calculation_efficiency}" + ) result = self.walltime try: result = ( From a04337c6e90457d02e345bbea7f8ad69647d9655 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 3 Dec 2019 20:26:42 +0100 Subject: [PATCH 475/648] fixed debug output leading to failing unit test 
--- lapis/job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index e7cf698..33bf108 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -104,7 +104,8 @@ async def _calculate(self): :return: """ print( - f"WALLTIME: Job {self} @ {time.now}, {self.used_resources['cores']}, " + f"WALLTIME: Job {self} @ {time.now}, " + f"{self.used_resources.get('cores', None)}, " f"{self.calculation_efficiency}" ) result = self.walltime From 6809f98cbfa9417304a261d5d743acb5c941dd0f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 4 Dec 2019 14:03:32 +0100 Subject: [PATCH 476/648] fixed bug in filesize unit conversion --- lapis/job_io/htcondor.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index f4caa6a..9089eaf 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -81,19 +81,25 @@ def htcondor_job_reader( used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for filename, filespecs in entry["Inputfiles"].items(): for key in filespecs.keys(): - filespecs[key] = filespecs[key] * unit_conversion_mapping.get( - key, 1 - ) + resources["inputfiles"][filename][key] = filespecs[ + key + ] * unit_conversion_mapping.get(key, 1) + used_resources["inputfiles"][filename][key] = filespecs[ + key + ] * unit_conversion_mapping.get(key, 1) + if "usedsize" in filespecs: del resources["inputfiles"][filename]["usedsize"] if "filesize" in filespecs: if "usedsize" not in filespecs: - used_resources["inputfiles"][filename]["usedsize"] = filespecs[ - "filesize" - ] + used_resources["inputfiles"][filename]["usedsize"] = resources[ + "inputfiles" + ][filename]["filesize"] del used_resources["inputfiles"][filename]["filesize"] + print(resources["inputfiles"], used_resources["inputfiles"]) + except KeyError: pass yield Job( From 21585709d6be8709236b21f8961de936928fb5dd Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 20 Jan 2020 10:33:21 +0100 Subject: [PATCH 477/648] Extended hitrate based caching to support different cache hitrates for every file and cache --- lapis/connection.py | 15 ++++++-- lapis/files.py | 6 +++ lapis/job_io/htcondor.py | 9 ++++- lapis/simulator.py | 2 +- lapis/storage_io/storage.py | 29 ++++++++++++++ lapis/storageelement.py | 75 ++++++++++++++++++++++++++++++++++++- 6 files changed, 127 insertions(+), 9 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index a8f59f4..9a677a3 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -10,7 +10,7 @@ delete_oldest_few_used, ) from lapis.storageelement import StorageElement, RemoteStorage -from lapis.files import RequestedFile +from lapis.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor import sampling_required @@ -132,9 +132,16 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): start_time = time.now async with Scope() as scope: for inputfilename, inputfilespecs in requested_files.items(): - requested_file = RequestedFile( - inputfilename, inputfilespecs["usedsize"] - ) + if "hitrates" in inputfilespecs.keys(): + requested_file = RequestedFile_HitrateBased( + inputfilename, + inputfilespecs["usedsize"], + inputfilespecs["hitrates"], + ) + else: + requested_file = RequestedFile( + inputfilename, inputfilespecs["usedsize"] + ) scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) stream_time = time.now - start_time print( diff --git a/lapis/files.py b/lapis/files.py index b7871aa..e227447 100644 --- 
a/lapis/files.py +++ b/lapis/files.py @@ -46,3 +46,9 @@ def convert_to_stored_file_object(self, currenttime): lastaccessed=currenttime, numberofaccesses=1, ) + + +class RequestedFile_HitrateBased(NamedTuple): + filename: str + filesize: float + cachehitrate: dict diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 9089eaf..87d94a6 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -74,6 +74,10 @@ def htcondor_job_reader( * unit_conversion_mapping.get(original_key, 1) ) + calculation_efficiency = entry.get( + "calculation_efficiency", calculation_efficiency + ) + try: if not entry["Inputfiles"]: del entry["Inputfiles"] @@ -81,6 +85,8 @@ def htcondor_job_reader( used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for filename, filespecs in entry["Inputfiles"].items(): for key in filespecs.keys(): + if key == "hitrates": + continue resources["inputfiles"][filename][key] = filespecs[ key ] * unit_conversion_mapping.get(key, 1) @@ -98,8 +104,6 @@ def htcondor_job_reader( ][filename]["filesize"] del used_resources["inputfiles"][filename]["filesize"] - print(resources["inputfiles"], used_resources["inputfiles"]) - except KeyError: pass yield Job( @@ -107,4 +111,5 @@ def htcondor_job_reader( used_resources=used_resources, queue_date=float(entry[used_resource_name_mapping["queuetime"]]), calculation_efficiency=calculation_efficiency, + name=entry.get("name", None), ) diff --git a/lapis/simulator.py b/lapis/simulator.py index 2bed938..d64671e 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -69,7 +69,7 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): self.controllers.append(controller(target=pool, rate=1)) def create_storage( - self, storage_input, storage_content_input, storage_reader, storage_type + self, storage_input, storage_reader, storage_type, storage_content_input=None ): assert self.connection, "Connection module needs to be created before storages" for storage in storage_reader( diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 526f8f1..b31c6b1 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -52,3 +52,32 @@ def storage_content_reader( row["filename"] ] = StoredFile(**row) return cache_information + + +def storage_reader_filebased_hitrate_caching( + storage, + storage_type, + storage_content=None, + unit_conversion_mapping: dict = { # noqa: B006 + "cachesizeGB": 1024 * 1024 * 1024, + "throughput_limit": 1024 * 1024 * 1024, + }, +): + + reader = csv.DictReader(storage, delimiter=" ", quotechar="'") + for row in reader: + print(row) + yield partial( + storage_type, + name=row["name"], + sitename=row["sitename"], + size=int( + float(row["cachesizeGB"]) + * unit_conversion_mapping.get("cachesizeGB", 1) + ), + throughput_limit=int( + float(row["throughput_limit"]) + * unit_conversion_mapping.get("throughput_limit", 1) + ), + files=dict(), + )() diff --git a/lapis/storageelement.py b/lapis/storageelement.py index a99d4c2..3f4f2f3 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -2,9 +2,11 @@ from usim import time, Resources, Pipe, Scope -from lapis.files import StoredFile, RequestedFile +from lapis.files import StoredFile, RequestedFile, RequestedFile_HitrateBased from lapis.interfaces._storage import Storage, LookUpInformation +import logging + class RemoteStorage(Storage): def __init__(self, pipe: Pipe): @@ -211,14 +213,24 @@ def used(self): async def transfer(self, file: RequestedFile, job_repr=None): print( - "TRANSFER: {}, 
filesize {}, remote: {}, cache: {}".format( + "TRANSFER: {}, filesize {}, remote: {}/{}, cache: {}/{}".format( self._hitrate, file.filesize, (1 - self._hitrate) * file.filesize, + self.remote_storage.connection.throughput, self._hitrate * file.filesize, + self.connection.throughput, ) ) async with Scope() as scope: + logging.getLogger("implementation").warning( + "{} {} @ {} in {}".format( + self._hitrate * file.filesize, + (1 - self._hitrate) * file.filesize, + time.now, + file.filename[-30:], + ) + ) scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) scope.do( self.remote_storage.connection.transfer( @@ -234,3 +246,62 @@ async def add(self, file: RequestedFile, job_repr=None): async def remove(self, file: StoredFile, job_repr=None): pass + + +class FileBasedHitrateStorage(StorageElement): + def __init__( + self, + name: Optional[str] = None, + sitename: Optional[str] = None, + size: int = 1000 * 1024 * 1024 * 1024, + throughput_limit: int = 10 * 1024 * 1024 * 1024, + files: Optional[dict] = None, + ): + super(FileBasedHitrateStorage, self).__init__( + name=name, + sitename=sitename, + size=size, + throughput_limit=throughput_limit, + files=files, + ) + + @property + def available(self): + return self.size + + @property + def used(self): + return 0 + + async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): + current_cachehitrate = file.cachehitrate.get(self.name, 0) + print( + "TRANSFER: on {} with {}, filesize {}, remote: {}/{}, cache: {}/{}".format( + self.name, + file.cachehitrate.get(self.name, 0), + file.filesize, + (1 - current_cachehitrate) * file.filesize, + self.remote_storage.connection.throughput, + current_cachehitrate * file.filesize, + self.connection.throughput, + ) + ) + async with Scope() as scope: + + scope.do( + self.connection.transfer(total=current_cachehitrate * file.filesize) + ) + scope.do( + self.remote_storage.connection.transfer( + total=(1 - current_cachehitrate) * file.filesize + ) + ) + + def find(self, requested_file: RequestedFile, job_repr=None): + return LookUpInformation(requested_file.filesize, self) + + async def add(self, file: RequestedFile, job_repr=None): + pass + + async def remove(self, file: StoredFile, job_repr=None): + pass From b8262612373c8cf02ab0ce952c73748aa50c9400 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:08:24 +0100 Subject: [PATCH 478/648] adjusted unit conversion --- lapis/job_io/htcondor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index 87d94a6..cb50b1d 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -26,13 +26,13 @@ def htcondor_job_reader( "RequestCpus": 1, "RequestWalltime": 1, "RequestMemory": 1024 * 1024, - "RequestDisk": 1024, + "RequestDisk": 1024, # KBytes "queuetime": 1, "RemoteWallClockTime": 1, - "MemoryUsage": 1000 * 1000, - "DiskUsage_RAW": 1024, - "filesize": 1024 * 1024 * 1024, - "usedsize": 1024 * 1024 * 1024, + "MemoryUsage": 1000 * 1000, # MB + "DiskUsage_RAW": 1024, # KBytes + "filesize": 1000 * 1000 * 1000, # GB + "usedsize": 1000 * 1000 * 1000, # GB }, ): input_file_type = iterable.name.split(".")[-1].lower() From 2d101d40ea508ae5b66dd1234d199a04bb555311 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:09:44 +0100 Subject: [PATCH 479/648] adjusted pipe monitoring --- lapis/monitor/caching.py | 116 ++++++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 39 deletions(-) diff --git 
a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 22ca822..598b12b 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -1,13 +1,22 @@ import logging +from typing import NamedTuple, Optional + from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter -from usim import Pipe from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler from lapis.storageelement import StorageElement +class MonitoredPipeInfo(NamedTuple): + requested_throughput: float + available_throughput: float + pipename: Optional[str] + throughputscale: float + no_subscriptions: int + + def storage_status(storage: StorageElement) -> list: """ Log information about current storage object state @@ -36,56 +45,85 @@ def storage_status(storage: StorageElement) -> list: } -def storage_connection(storage: StorageElement) -> list: - """ - Log information about the storages connection - :param storage: - :return: +def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: """ + # Log information about the pipes + # :param storage: + # :return: + # """ results = [ { - "storage": repr(storage), - "throughput": storage.connection.throughput, - "requested_throughput": sum(storage.connection._subscriptions.values()), - "throughput_scale": storage.connection._throughput_scale, + "pipe": repr(pipeinfo.pipename), + "throughput": pipeinfo.available_throughput, + "requested_throughput": pipeinfo.requested_throughput, + "throughput_scale": pipeinfo.throughputscale, + "no_subscribers": pipeinfo.no_subscriptions, } ] return results -storage_connection.name = "storage_connection" -storage_connection.whitelist = (StorageElement,) -storage_connection.logging_formatter = { +pipe_status.name = "pipe_status" +pipe_status.whitelist = (MonitoredPipeInfo,) +pipe_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: JsonFormatter(), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1 + tags={"tardis", "pipe"}, resolution=1 ), } -def remote_connection(remote: Pipe) -> list: - """ - Log information about the remote connection - :param remote: - :return: - """ - results = [ - { - "throughput": remote.throughput, - "requested_throughput": sum(remote._subscriptions.values()), - "throughput_scale": remote._throughput_scale, - } - ] - return results - - -remote_connection.name = "remote_connection" -remote_connection.whitelist = (Pipe,) -remote_connection.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 - ), -} +# def storage_connection(storage: StorageElement) -> list: +# """ +# Log information about the storages connection +# :param storage: +# :return: +# """ +# results = [ +# { +# "storage": repr(storage), +# "throughput": storage.connection.throughput, +# "requested_throughput": sum(storage.connection._subscriptions.values()), +# "throughput_scale": storage.connection._throughput_scale, +# } +# ] +# return results +# +# +# storage_connection.name = "storage_connection" +# storage_connection.whitelist = (StorageElement,) +# storage_connection.logging_formatter = { +# LoggingSocketHandler.__name__: JsonFormatter(), +# logging.StreamHandler.__name__: JsonFormatter(), +# LoggingUDPSocketHandler.__name__: LineProtocolFormatter( +# tags={"tardis", "storage"}, resolution=1 +# ), +# } +# +# +# def 
remote_connection(remote: Pipe) -> list: +# """ +# Log information about the remote connection +# :param remote: +# :return: +# """ +# results = [ +# { +# "throughput": remote.throughput, +# "requested_throughput": sum(remote._subscriptions.values()), +# "throughput_scale": remote._throughput_scale, +# } +# ] +# return results +# +# +# remote_connection.name = "remote_connection" +# remote_connection.whitelist = (Pipe,) +# remote_connection.logging_formatter = { +# LoggingSocketHandler.__name__: JsonFormatter(), +# logging.StreamHandler.__name__: JsonFormatter(), +# LoggingUDPSocketHandler.__name__: LineProtocolFormatter( +# tags={"tardis"}, resolution=1 +# ), +# } From 7cf07024226d203cd39a9702ce4c872424cc69b0 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:12:33 +0100 Subject: [PATCH 480/648] adjusted resource ratio calculation to allow for resource levels of 0, added drone_statistics_caching --- lapis/monitor/general.py | 75 ++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index be6d24d..ab46b2b 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -26,18 +26,31 @@ def resource_statistics(drone: Drone) -> List[Dict]: resources = drone.theoretical_available_resources used_resources = drone.available_resources for resource_type in resources: - results.append( - { - "resource_type": resource_type, - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "used_ratio": 1 - - used_resources[resource_type] / drone.pool_resources[resource_type], - "requested_ratio": 1 - - resources[resource_type] / drone.pool_resources[resource_type], - } - ) + try: + results.append( + { + "resource_type": resource_type, + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "used_ratio": 1 + - used_resources[resource_type] + / drone.pool_resources[resource_type], + "requested_ratio": 1 + - resources[resource_type] / drone.pool_resources[resource_type], + } + ) + except ZeroDivisionError: + results.append( + { + "resource_type": resource_type, + "pool_configuration": "None", + "pool_type": "drone", + "pool": repr(drone), + "used_ratio": 1, + "requested_ratio": 1, + } + ) return results @@ -89,9 +102,8 @@ def job_statistics(scheduler: CondorJobScheduler) -> List[Dict]: :return: list of records for logging """ result = 0 - for cluster in scheduler.drone_cluster.copy(): - for drone in cluster: - result += drone.jobs + for drone in scheduler.drone_list: + result += drone.jobs return [ { "pool_configuration": "None", @@ -143,6 +155,7 @@ def job_events(job: Job) -> List[Dict]: if job.successful is None: result["queue_time"] = job.queue_date result["waiting_time"] = job.waiting_time + result["starting"] = 1 elif job.successful: result["wall_time"] = job.walltime result["success"] = 1 @@ -215,3 +228,35 @@ def configuration_information(simulator: "Simulator") -> List[Dict]: tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 ), } + + +def drone_statistics_caching(drone: Drone) -> List[Dict]: + """ + + + :param drone: the drone + :return: list of records for logging + """ + full_resources = drone.pool_resources + resources = drone.theoretical_available_resources + + results = [ + { + "pool_type": "drone", + "pool": repr(drone), + "claimed_slots": full_resources["cores"] - resources["cores"], + "free_slots": resources["cores"], + } + ] + return results + + +drone_statistics_caching.name = "drone_status_caching" 
+drone_statistics_caching.whitelist = (Drone,) +drone_statistics_caching.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: JsonFormatter(), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_type", "pool"}, resolution=1 + ), +} From f724df636ba0a8cb320d137b7881157dc904e1b1 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:15:04 +0100 Subject: [PATCH 481/648] adjusted unit conversion and transfer for file wise defined caching hitrates --- lapis/storageelement.py | 52 ++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 3f4f2f3..55bf6e1 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -1,6 +1,7 @@ from typing import Optional -from usim import time, Resources, Pipe, Scope +from usim import time, Resources, Scope +from monitoredpipe import MonitoredPipe from lapis.files import StoredFile, RequestedFile, RequestedFile_HitrateBased from lapis.interfaces._storage import Storage, LookUpInformation @@ -9,8 +10,9 @@ class RemoteStorage(Storage): - def __init__(self, pipe: Pipe): + def __init__(self, pipe: MonitoredPipe): self.connection = pipe + pipe.storage = repr(self) @property def size(self): @@ -56,8 +58,8 @@ def __init__( self, name: Optional[str] = None, sitename: Optional[str] = None, - size: int = 1000 * 1024 * 1024 * 1024, - throughput_limit: int = 10 * 1024 * 1024 * 1024, + size: int = 1000 * 1000 * 1000 * 1000, + throughput_limit: int = 10 * 1000 * 1000 * 1000, files: Optional[dict] = None, ): self.name = name @@ -69,7 +71,9 @@ def __init__( self._usedstorage = Resources( size=sum(file.storedsize for file in files.values()) ) - self.connection = Pipe(throughput_limit) + self.connection = MonitoredPipe(throughput_limit) + self.connection.storage = repr(self) + self.remote_storage = None @property @@ -190,8 +194,8 @@ def __init__( hitrate, name: Optional[str] = None, sitename: Optional[str] = None, - size: int = 1000 * 1024 * 1024 * 1024, - throughput_limit: int = 10 * 1024 * 1024 * 1024, + size: int = 1000 * 1000 * 1000 * 1000, + throughput_limit: int = 10 * 1000 * 1000 * 1000, files: Optional[dict] = None, ): super(HitrateStorage, self).__init__( @@ -253,8 +257,8 @@ def __init__( self, name: Optional[str] = None, sitename: Optional[str] = None, - size: int = 1000 * 1024 * 1024 * 1024, - throughput_limit: int = 10 * 1024 * 1024 * 1024, + size: int = 1000 * 1000 * 1000 * 1000, + throughput_limit: int = 10 * 1000 * 1000 * 1000, files: Optional[dict] = None, ): super(FileBasedHitrateStorage, self).__init__( @@ -274,31 +278,27 @@ def used(self): return 0 async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): - current_cachehitrate = file.cachehitrate.get(self.name, 0) print( "TRANSFER: on {} with {}, filesize {}, remote: {}/{}, cache: {}/{}".format( self.name, - file.cachehitrate.get(self.name, 0), + file.cachehitrate, file.filesize, - (1 - current_cachehitrate) * file.filesize, + (1 - file.cachehitrate) * file.filesize, self.remote_storage.connection.throughput, - current_cachehitrate * file.filesize, + file.cachehitrate * file.filesize, self.connection.throughput, ) ) - async with Scope() as scope: - - scope.do( - self.connection.transfer(total=current_cachehitrate * file.filesize) - ) - scope.do( - self.remote_storage.connection.transfer( - total=(1 - current_cachehitrate) * file.filesize - ) - ) - - def find(self, requested_file: 
RequestedFile, job_repr=None): - return LookUpInformation(requested_file.filesize, self) + if file.cachehitrate: + await self.connection.transfer(total=file.filesize) + else: + await self.remote_storage.connection.transfer(total=file.filesize) + + def find(self, requested_file: RequestedFile_HitrateBased, job_repr=None): + # return LookUpInformation(requested_file.filesize, self) + return LookUpInformation( + requested_file.filesize * requested_file.cachehitrate, self + ) async def add(self, file: RequestedFile, job_repr=None): pass From 4878638449ac352e6cfc3e9bc79a9555a6a98d63 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:15:35 +0100 Subject: [PATCH 482/648] adjusted unit conversion --- lapis/storage_io/storage.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index b31c6b1..1d1fd48 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -9,8 +9,8 @@ def storage_reader( storage_content, storage_type, unit_conversion_mapping: dict = { # noqa: B006 - "cachesizeGB": 1024 * 1024 * 1024, - "throughput_limit": 1024 * 1024 * 1024, + "cachesizeGB": 1000 * 1000 * 1000, # GB + "throughput_limit": 1000 * 1000 * 1000, # GB }, ): try: @@ -38,8 +38,8 @@ def storage_reader( def storage_content_reader( file_name, unit_conversion_mapping: dict = { # noqa: B006 - "filesize": 1024 * 1024 * 1024, - "storedsize": 1024 * 1024 * 1024, + "filesize": 1000 * 1000 * 1000, + "storedsize": 1000 * 1000 * 1000, }, ): reader = csv.DictReader(file_name, delimiter=" ", quotechar="'") @@ -59,14 +59,13 @@ def storage_reader_filebased_hitrate_caching( storage_type, storage_content=None, unit_conversion_mapping: dict = { # noqa: B006 - "cachesizeGB": 1024 * 1024 * 1024, - "throughput_limit": 1024 * 1024 * 1024, + "cachesizeGB": 1000 * 1000 * 1000, + "throughput_limit": 1000 * 1000 * 1000, }, ): reader = csv.DictReader(storage, delimiter=" ", quotechar="'") for row in reader: - print(row) yield partial( storage_type, name=row["name"], From e853f039146600f3c1b028f9c2ed952eaa441513 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:16:10 +0100 Subject: [PATCH 483/648] added pipe monitoring --- lapis/simulator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index d64671e..b472811 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -12,7 +12,7 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.connection import Connection -from lapis.monitor.caching import storage_status, storage_connection, remote_connection +from lapis.monitor.caching import storage_status, pipe_status from lapis.monitor.general import ( user_demand, job_statistics, @@ -20,6 +20,7 @@ pool_status, configuration_information, job_events, + drone_statistics_caching, ) from lapis.monitor.cobald import drone_statistics, pool_statistics from lapis.pool import Pool @@ -50,8 +51,8 @@ def enable_monitoring(self): self.monitoring.register_statistic(pool_status) self.monitoring.register_statistic(configuration_information) self.monitoring.register_statistic(storage_status) - self.monitoring.register_statistic(storage_connection) - self.monitoring.register_statistic(remote_connection) + self.monitoring.register_statistic(pipe_status) + self.monitoring.register_statistic(drone_statistics_caching) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) @@ -101,6 
+102,7 @@ async def _simulate(self, end): for controller in self.controllers: while_running.do(controller.run(), volatile=True) while_running.do(self.monitoring.run(), volatile=True) + while_running.do(self.connection.run_pipemonitoring(), volatile=True) self.duration = time.now print( f"[lapis-{monitor.SIMULATION_START}] Finished simulation at {self.duration}" From 621196a8a3c5ed7ca2c54d6f065f4fd93f8daed9 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:16:43 +0100 Subject: [PATCH 484/648] added classad based scheduler prototype --- lapis/scheduler.py | 311 ++++++++++++++++++++++++++++++--------------- 1 file changed, 205 insertions(+), 106 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 7c25e6a..48991c5 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,7 +1,14 @@ -from typing import Dict -from usim import Scope, interval, time +from typing import Dict, Iterator, Union, Tuple, List, TypeVar, Generic, Optional +from weakref import WeakKeyDictionary + +from classad import parse +from classad._functions import quantize +from classad._primitives import HTCInt, Undefined +from classad._expression import ClassAd +from usim import Scope, interval, Resources from lapis.drone import Drone +from lapis.job import Job from lapis.monitor import sampling_required @@ -9,6 +16,62 @@ class JobQueue(list): pass +quantization_defaults = { + "memory": HTCInt(128 * 1024 * 1024), + "disk": HTCInt(1024 * 1024), + "cores": HTCInt(1), +} + +DJ = TypeVar("DJ", Drone, Job) + + +class WrappedClassAd(ClassAd, Generic[DJ]): + + __slots__ = "_wrapped" + + def __init__(self, classad: ClassAd, wrapped: DJ): + super(WrappedClassAd, self).__init__() + self._wrapped = wrapped + self._data = classad._data + + def __getitem__(self, item): + def access_wrapped(name, requested=True): + if isinstance(self._wrapped, Drone): + return self._wrapped.theoretical_available_resources[name] + if requested: + return self._wrapped.resources[name] + return self._wrapped.used_resources[name] + + if "target" not in item: + if "requestcpus" in item: + return access_wrapped("cores", requested=True) + elif "requestmemory" in item: + return (1 / 1024 / 1024) * access_wrapped("memory", requested=True) + elif "requestdisk" in item: + return (1 / 1024) * access_wrapped("disk", requested=True) + elif "cpus" in item: + return access_wrapped("cores", requested=False) + elif "memory" in item: + return (1 / 1000 / 1000) * access_wrapped("memory", requested=False) + elif "disk" in item: + return (1 / 1024) * access_wrapped("disk", requested=False) + return super(WrappedClassAd, self).__getitem__(item) + + def __repr__(self): + return f"<{self.__class__.__name__}>: {self._wrapped}" + + def __eq__(self, other): + return super().__eq__(other) and self._wrapped == other._wrapped + + +class Cluster(List[WrappedClassAd[DJ]], Generic[DJ]): + pass + + +class Bucket(List[Cluster[DJ]], Generic[DJ]): + pass + + class CondorJobScheduler(object): """ Goal of the htcondor job scheduler is to have a scheduler that somehow @@ -16,7 +79,6 @@ class CondorJobScheduler(object): Htcondor does scheduling based on a priority queue. The priorities itself are managed by operators of htcondor. So different instances can apparently behave very different. - In my case I am going to try building a priority queue that sorts job slots by increasing cost. The cost itself is calculated based on the current strategy that is used at GridKa. 
The scheduler checks if a job either @@ -28,87 +90,141 @@ class CondorJobScheduler(object): def __init__(self, job_queue): self._stream_queue = job_queue - self.drone_cluster = [] + self.drone_cluster: Dict[Tuple[float, ...], Cluster[WrappedClassAd[Drone]]] = {} + self.job_cluster: Dict[Tuple[float, ...], Cluster[WrappedClassAd[Job]]] = {} self.interval = 60 self.job_queue = JobQueue() self._collecting = True self._processing = Resources(jobs=0) + # temporary solution + self._wrapped_classads = WeakKeyDictionary() + self._machine_classad = parse( + """ + requirements = target.requestcpus <= my.cpus + pre_job_rank = 1 + rank = 0 + + """ + ) + self._job_classad = parse( + """ + requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory + """ + ) + @property - def drone_list(self): - for cluster in self.drone_cluster: + def drone_list(self) -> Iterator[Drone]: + for cluster in self.drone_cluster.values(): for drone in cluster: - yield drone + yield drone._wrapped def register_drone(self, drone: Drone): - self._add_drone(drone) + wrapped_drone = WrappedClassAd(classad=self._machine_classad, wrapped=drone) + self._wrapped_classads[drone] = wrapped_drone + self._add_drone(wrapped_drone) def unregister_drone(self, drone: Drone): - for cluster in self.drone_cluster: + drone_wrapper = self._wrapped_classads[drone] + for key in self.drone_cluster: try: - cluster.remove(drone) + self.drone_cluster[key].remove(drone_wrapper) except ValueError: pass else: - if len(cluster) == 0: - self.drone_cluster.remove(cluster) - - def _add_drone(self, drone: Drone, drone_resources: Dict = None): - minimum_distance_cluster = None - distance = float("Inf") - if len(self.drone_cluster) > 0: - for cluster in self.drone_cluster: - current_distance = 0 - for key in {*cluster[0].pool_resources, *drone.pool_resources}: - if drone_resources: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone_resources.get(key, 0) - ) - else: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0) - ) - if current_distance < distance: - minimum_distance_cluster = cluster - distance = current_distance - if distance < 1: - minimum_distance_cluster.append(drone) - else: - self.drone_cluster.append([drone]) + break else: - self.drone_cluster.append([drone]) + # nothing was removed + return + if len(self.drone_cluster[key]) == 0: + del self.drone_cluster[key] + + @staticmethod + def _clustering_key(resource_dict: Dict): + clustering_key = [] + for key, value in resource_dict.items(): + clustering_key.append( + int(quantize(value, quantization_defaults.get(key, 1))) + ) + return tuple(clustering_key) + + def _add_drone(self, drone: WrappedClassAd, drone_resources: Dict = None): + wrapped_drone = drone._wrapped + if drone_resources: + clustering_key = self._clustering_key(drone_resources) + else: + clustering_key = self._clustering_key(wrapped_drone.available_resources) + self.drone_cluster.setdefault(clustering_key, Cluster()).append(drone) def update_drone(self, drone: Drone): self.unregister_drone(drone) - self._add_drone(drone) + self._add_drone(self._wrapped_classads[drone]) + + def _sort_drone_cluster(self): + return [Bucket(self.drone_cluster.values())] + + def _sort_job_cluster(self): + return Bucket(self.job_cluster.values()) async def run(self): + def filter_drones(job: WrappedClassAd[Job], drone_bucket: Bucket[Drone]): + result = {} # type: Dict[Union[Undefined, float], Bucket[Drone]] + for 
drones in drone_bucket: + drone = drones[0] # type: WrappedClassAd[Drone] + if job.evaluate( + "requirements", my=job, target=drone + ) and drone.evaluate("requirements", my=drone, target=job): + rank = drone.evaluate("rank", my=job, target=drone) + result.setdefault(rank, Bucket()).append(drones) + return result + + def pop_first( + ranked_drones: Dict[Union[Undefined, float], Bucket[Drone]] + ) -> Optional[WrappedClassAd[Drone]]: + if not ranked_drones: + return None + # print(ranked_drones) + key = sorted(ranked_drones)[0] + values = ranked_drones[key] + # print(key, values) + result = values[0] + values.remove(result) + if not values: + del ranked_drones[key] + try: + return result[0] + except IndexError: + return pop_first(ranked_drones) + async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): - print("NEW SCHEDULING INTERVAL @ {}".format(time.now)) - print(self.job_queue) - for job in self.job_queue.copy(): - print("SCHEDULING {}".format(repr(job))) - best_match = self._schedule_job(job) - if best_match: - print( - "start job {} on drone {} @ {}".format( - repr(job), repr(best_match), time.now - ) - ) - scope.do(best_match.start_job(job)) - self.job_queue.remove(job) - await sampling_required.put(self.job_queue) - self.unregister_drone(best_match) - left_resources = best_match.theoretical_available_resources - left_resources = { - key: value - job.resources.get(key, 0) - for key, value in left_resources.items() - } - self._add_drone(best_match, left_resources) + + # TODO: get sorted job cluster [{Job, ...}, ...] + # TODO: get set of drone cluster {{PSlot, ...}, ...} + # TODO: get sorted drone clusters PreJob [{{PSlot, ...}, ...}, ...] + # TODO: filter (Job.Requirements) and sort (Job.Rank) for job and drones => lazy + + all_drone_buckets = self._sort_drone_cluster() + filtered_drones = {} + for jobs in self._sort_job_cluster().copy(): + current_drone_bucket = 0 + for job in jobs: + best_match = pop_first(filtered_drones) + while best_match is None: + # lazily evaluate more PSlots + try: + # TODO: sort filtered_drones + filtered_drones = filter_drones( + job, all_drone_buckets[current_drone_bucket] + ) + except IndexError: + break + current_drone_bucket += 1 + best_match = pop_first(filtered_drones) + else: + # TODO: update drone and check if it gets reinserted to filtered_drones + await self._execute_job(job=job, drone=best_match) if ( not self._collecting and not self.job_queue @@ -117,11 +233,34 @@ async def run(self): break await sampling_required.put(self) + async def _execute_job(self, job: WrappedClassAd, drone: WrappedClassAd): + wrapped_job = job._wrapped + wrapped_drone = drone._wrapped + await wrapped_drone.schedule_job(wrapped_job) + self.job_queue.remove(job) + cluster_key = self._clustering_key(wrapped_job.resources) + self.job_cluster[cluster_key].remove(job) + if len(self.job_cluster[cluster_key]) == 0: + del self.job_cluster[cluster_key] + await sampling_required.put(self.job_queue) + self.unregister_drone(wrapped_drone) + left_resources = { + key: value - wrapped_job.resources.get(key, 0) + for key, value in wrapped_drone.theoretical_available_resources.items() + } + self._add_drone(drone, left_resources) + async def _collect_jobs(self): async for job in self._stream_queue: - self.job_queue.append(job) + wrapped_job = WrappedClassAd(classad=self._job_classad, wrapped=job) + self._wrapped_classads[job] = wrapped_job + self.job_queue.append(wrapped_job) + cluster_key = self._clustering_key(job.resources) + 
self.job_cluster.setdefault(cluster_key, []).append(wrapped_job) + print(self.job_cluster) await self._processing.increase(jobs=1) # TODO: logging happens with each job + # TODO: job queue to the outside now contains wrapped classads... await sampling_required.put(self.job_queue) self._collecting = False @@ -129,48 +268,8 @@ async def job_finished(self, job): if job.successful: await self._processing.decrease(jobs=1) else: - await self._stream_queue.put(job) - - def _schedule_job(self, job) -> Drone: - priorities = {} - for cluster in self.drone_cluster: - drone = cluster[0] - cost = 0 - resources = drone.theoretical_available_resources - # print( - # "trying to match Job {} to {}, resources {}".format( - # repr(job), repr(drone), resources - # ) - # ) - for resource_type in job.resources: - if resources.get(resource_type, 0) < job.resources[resource_type]: - # Inf for all job resources that a drone does not support - # and all resources that are too small to even be considered - cost = float("Inf") - break - else: - try: - cost += 1 / ( - resources[resource_type] // job.resources[resource_type] - ) - except KeyError: - pass - for additional_resource_type in [ - key for key in drone.pool_resources if key not in job.resources - ]: - cost += resources[additional_resource_type] - cost /= len((*job.resources, *drone.pool_resources)) - if cost <= 1: - # directly start job - return drone - try: - priorities[cost].append(drone) - except KeyError: - priorities[cost] = [drone] - try: - minimal_key = min(priorities) - if minimal_key < float("Inf"): - return priorities[minimal_key][0] - except ValueError: - pass - return None + self.job_queue.append(self._wrapped_classads[job]) + cluster_key = self._clustering_key(job.resources) + self.job_cluster.setdefault(cluster_key, []).append( + self._wrapped_classads[job] + ) From b94eac4eec18305f56480900e776c0eedc4c364a Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:17:20 +0100 Subject: [PATCH 485/648] introduced new attributes for caching functionality monitoring --- lapis/job.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 33bf108..339c64e 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -25,6 +25,10 @@ class Job(object): "drone", "_success", "calculation_efficiency", + "__weakref__", + "_coordinated", + "_used_cache", + "_total_input_data", ) def __init__( @@ -75,6 +79,11 @@ def __init__( # caching-related self.requested_inputfiles = resources.pop("inputfiles", None) self.used_inputfiles = used_resources.pop("inputfiles", None) + self._coordinated = 0 + self._used_cache = 0 + self._total_input_data = sum( + [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] + ) @property def name(self) -> str: @@ -88,7 +97,7 @@ def successful(self) -> Optional[bool]: def waiting_time(self) -> float: """ The time the job spent in the simulators scheduling queue. `Inf` when - the job is still waitiing. + the job is still waiting. 
:return: Time in queue """ @@ -163,7 +172,7 @@ async def run(self, drone: "Drone"): print(f"monitored walltime of {old_walltime} changed to {self.walltime}") self.drone = None self._success = True - await sampling_required.put(self) + await sampling_required.put(self) def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) From 5a4e8bf44cf26f4fdfc887411ccfb7cbf2f84823 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:17:37 +0100 Subject: [PATCH 486/648] adjusted types --- lapis/files.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/files.py b/lapis/files.py index e227447..b43c8fb 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -50,5 +50,5 @@ def convert_to_stored_file_object(self, currenttime): class RequestedFile_HitrateBased(NamedTuple): filename: str - filesize: float - cachehitrate: dict + filesize: int + cachehitrate: int From 009310aa38ed8e74bb1e8652273fb8001d758d18 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:18:40 +0100 Subject: [PATCH 487/648] added functionality to handle resource levels of 0 --- lapis/drone.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index ffe8c61..e802de2 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -34,6 +34,7 @@ def __init__( self.resources = Capacities(**pool_resources) # shadowing requested resources to determine jobs to be killed self.used_resources = Capacities(**pool_resources) + if ignore_resources: self._valid_resource_keys = [ resource @@ -49,6 +50,9 @@ def __init__( self._utilisation = None self._job_queue = Queue() + # caching-related + self.jobs_using_caching = 0 + @property def theoretical_available_resources(self): return dict(self.resources.levels) @@ -96,9 +100,15 @@ def _init_allocation_and_utilisation(self): levels = self.resources.levels resources = [] for resource_key in self._valid_resource_keys: - resources.append( - getattr(levels, resource_key) / self.pool_resources[resource_key] - ) + if ( + getattr(levels, resource_key) == 0 + and self.pool_resources[resource_key] == 0 + ): + pass + else: + resources.append( + getattr(levels, resource_key) / self.pool_resources[resource_key] + ) self._allocation = max(resources) self._utilisation = min(resources) @@ -150,7 +160,7 @@ async def _run_job(self, job: Job, kill: bool): except KeyError: # check is not relevant if the data is not stored pass - self.scheduler.update_drone(self) + # self.scheduler.update_drone(self) await job_execution.done print( "finished job {} on drone {} @ {}".format( @@ -172,4 +182,4 @@ async def _run_job(self, job: Job, kill: bool): await sampling_required.put(self) def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, id(self)) + return "<%s: %s %s>" % (self.__class__.__name__, id(self), self.sitename) From 2966f1d7a44a871dc494a754856cc8c565340fa3 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:19:15 +0100 Subject: [PATCH 488/648] added monitored pipes and job wise hitrate definition --- lapis/connection.py | 91 ++++++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 9a677a3..56c7664 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -1,7 +1,8 @@ import random from typing import Union, Optional -from usim import Scope, time, Pipe +from usim import Scope, time +from monitoredpipe import MonitoredPipe from lapis.cachealgorithm import 
( CacheAlgorithm, @@ -12,15 +13,16 @@ from lapis.storageelement import StorageElement, RemoteStorage from lapis.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor import sampling_required +from lapis.monitor.caching import MonitoredPipeInfo class Connection(object): __slots__ = ("storages", "remote_connection", "caching_algorithm") - def __init__(self, throughput=100): + def __init__(self, throughput=1000 * 1000 * 1000): self.storages = dict() - self.remote_connection = RemoteStorage(Pipe(throughput=throughput)) + self.remote_connection = RemoteStorage(MonitoredPipe(throughput=throughput)) self.caching_algorithm = CacheAlgorithm( caching_strategy=lambda file, storage: check_size(file, storage) and check_relevance(file, storage), @@ -29,6 +31,28 @@ def __init__(self, throughput=100): ), ) + async def run_pipemonitoring(self): + async def report_load_to_monitoring(pipe: MonitoredPipe): + async for throughput in pipe.load(): + await sampling_required.put( + MonitoredPipeInfo( + throughput, + pipe.throughput, + repr(pipe), + pipe._throughput_scale, + len(pipe._subscriptions), + ) + ) + print( + f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" + ) + + async with Scope() as scope: + scope.do(report_load_to_monitoring(self.remote_connection.connection)) + for storage_key, storage_list in self.storages.items(): + for storage in storage_list: + scope.do(report_load_to_monitoring(storage.connection)) + def add_storage_element(self, storage_element: StorageElement): """ Register storage element in Connetion module clustering storage elements by @@ -90,7 +114,7 @@ async def stream_file( used_connection = await self._determine_inputfile_source( requested_file, dronesite, job_repr ) - await sampling_required.put(used_connection) + # await sampling_required.put(used_connection) if used_connection == self.remote_connection and self.storages.get( dronesite, None ): @@ -113,16 +137,16 @@ async def stream_file( pass print(f"now transfering {requested_file.filesize} from {used_connection}") await used_connection.transfer(requested_file, job_repr=job_repr) - print( - "Job {}: finished transfering of file {}: {}GB @ {}".format( - job_repr, requested_file.filename, requested_file.filesize, time.now - ) - ) + # print( + # "Job {}: finished transfering of file {}: {}B @ {}".format( + # job_repr, requested_file.filename, requested_file.filesize, time.now + # ) + # ) async def transfer_files(self, drone, requested_files: dict, job_repr=None): """ Converts dict information about requested files to RequestedFile object and - parallely launches streaming for all files + sequentially streams all files :param drone: :param requested_files: :param job_repr: @@ -130,21 +154,36 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): """ start_time = time.now - async with Scope() as scope: - for inputfilename, inputfilespecs in requested_files.items(): - if "hitrates" in inputfilespecs.keys(): - requested_file = RequestedFile_HitrateBased( - inputfilename, - inputfilespecs["usedsize"], - inputfilespecs["hitrates"], - ) - else: - requested_file = RequestedFile( - inputfilename, inputfilespecs["usedsize"] - ) - scope.do(self.stream_file(requested_file, drone.sitename, job_repr)) + + # decision if a jobs inputfiles are cached based on hitrate + random_inputfile_information = next(iter(requested_files.values())) + if "hitrates" in random_inputfile_information.keys(): + hitrate = sum( + [ + file["usedsize"] * file["hitrates"].get(drone.sitename, 0.0) 
+ for file in requested_files.values() + ] + ) / sum([file["usedsize"] for file in requested_files.values()]) + provides_file = int(random.random() < hitrate) + print( + "{} on {} hitrate {} => {}".format( + requested_files, drone.sitename, hitrate, provides_file + ) + ) + + for inputfilename, inputfilespecs in requested_files.items(): + if "hitrates" in inputfilespecs.keys(): + requested_file = RequestedFile_HitrateBased( + inputfilename, inputfilespecs["usedsize"], provides_file + ) + + else: + requested_file = RequestedFile( + inputfilename, inputfilespecs["usedsize"] + ) + await self.stream_file(requested_file, drone.sitename, job_repr) stream_time = time.now - start_time - print( - "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) - ) + # print( + # "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) + # ) return stream_time From 2613e77b9e2f354704c939087277c3dadaa97e0c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:22:25 +0100 Subject: [PATCH 489/648] added script to execute simulation without using CLI --- custom_simulate.py | 131 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 custom_simulate.py diff --git a/custom_simulate.py b/custom_simulate.py new file mode 100644 index 0000000..4ffade7 --- /dev/null +++ b/custom_simulate.py @@ -0,0 +1,131 @@ +from functools import partial + +import logging.handlers + +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter + +from lapis.job_io.htcondor import htcondor_job_reader +from lapis.pool import StaticPool +from lapis.pool_io.htcondor import htcondor_pool_reader +from lapis.job_io.swf import swf_job_reader +from lapis.storageelement import FileBasedHitrateStorage +from lapis.storage_io.storage import ( + storage_reader, + storage_reader_filebased_hitrate_caching, +) + +from lapis.scheduler import CondorJobScheduler +from lapis.simulator import Simulator + + +from lapis.monitor import LoggingUDPSocketHandler, SimulationTimeFilter + +from time import time + + +last_step = 0 + +job_import_mapper = {"htcondor": htcondor_job_reader, "swf": swf_job_reader} + +pool_import_mapper = {"htcondor": htcondor_pool_reader} + +storage_import_mapper = { + "standard": storage_reader, + "filehitrate": storage_reader_filebased_hitrate_caching, +} + + +def ini_and_run( + job_file, + pool_files, + storage_file, + storage_type, + log_file="test_{}.log".format(time()), + remote_throughput=1.0, + seed=1234, + until=None, + calculation_efficiency=1.0, + log_telegraf=False, +): + # ini logging to file + monitoring_logger = logging.getLogger() + monitoring_logger.setLevel(logging.DEBUG) + time_filter = SimulationTimeFilter() + monitoring_logger.addFilter(time_filter) + streamHandler = logging.StreamHandler(stream=open(log_file, "w")) + streamHandler.setFormatter(JsonFormatter()) + monitoring_logger.addHandler(streamHandler) + + if log_telegraf: + telegrafHandler = LoggingUDPSocketHandler( + "localhost", logging.handlers.DEFAULT_UDP_LOGGING_PORT + ) + telegrafHandler.setFormatter(LineProtocolFormatter(resolution=1)) + monitoring_logger.addHandler(telegrafHandler) + + # ini simulation + print("starting static environment") + simulator = Simulator(seed=time()) + file_type = "htcondor" + file = job_file + # print() + # input() + simulator.create_job_generator( + job_input=open(file, "r"), + job_reader=partial( + job_import_mapper[file_type], calculation_efficiency=calculation_efficiency + 
), + ) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + + simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) + with open(storage_file, "r") as storage_file: + simulator.create_storage( + storage_input=storage_file, + storage_content_input=None, + storage_reader=storage_import_mapper[storage_type], + storage_type=FileBasedHitrateStorage, + ) + + for pool_file in pool_files: + with open(pool_file, "r") as pool_file: + pool_file_type = "htcondor" + simulator.create_pools( + pool_input=pool_file, + pool_reader=pool_import_mapper[pool_file_type], + pool_type=StaticPool, + ) + simulator.enable_monitoring() + + # run simulation + simulator.run(until=until) + + +# job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" +job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" +# pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", +# "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] +# storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" +# storage_type = "filehitrate" +# +# ini_and_run(job_file=job_file, pool_files=pool_files, storage_file=storage_file, +# storage_type=storage_type, log_file="minimal_hitratebased_test.log", +# log_telegraf=True) + +# job_file = "/home/tabea/work/testdata/hitratebased/testjobs.json" +# job_file = "/home/tabea/work/testdata/hitratebased/week.json" +# job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" +# job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" +pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines_only_cpu.csv"] +# "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] +storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" +storage_type = "filehitrate" +ini_and_run( + job_file=job_file, + pool_files=pool_files, + storage_file=storage_file, + storage_type=storage_type, + log_file="minimal_hitratebased_test.log", + log_telegraf=True, +) From 5948a55bcb1cd20bb5600f504f9870d11262b435 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:22:56 +0100 Subject: [PATCH 490/648] added pipe class with monitoring functionality --- monitoredpipe.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 monitoredpipe.py diff --git a/monitoredpipe.py b/monitoredpipe.py new file mode 100644 index 0000000..50e096c --- /dev/null +++ b/monitoredpipe.py @@ -0,0 +1,58 @@ +from usim import Pipe, instant +from usim._primitives.notification import Notification + + +class MonitoredPipe(Pipe): + def __init__(self, throughput: float): + super().__init__(throughput) + self._monitor = Notification() + self.storage = None + + async def load(self): + """ + Monitor any changes of the throughput load of the pipe + .. code:: python3 + async def report_load(pipe: MonitoredPipe): + async for throughput in pipe.load(): + print(f'{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]') + .. note:: + Currently only works for loads exceeding 100%. 
+ """ + await instant + + yield sum(self._subscriptions.values()) + while True: + await self._monitor + yield sum(self._subscriptions.values()) + + def _throttle_subscribers(self): + self._monitor.__awake_all__() + super()._throttle_subscribers() + + def __repr__(self): + return "<%s: %s>" % (self.__class__.__name__, self.storage or id(self)) + + +if __name__ == "__main__": + from usim import time, run, Scope + + async def report_load(pipe: MonitoredPipe): + async for throughput in pipe.load(): + print( + f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" + ) + + async def perform_load(pipe: MonitoredPipe, delay, amount): + await (time + delay) + await pipe.transfer(amount, pipe.throughput / 2) + + async def main(): + pipe = MonitoredPipe(128) + async with Scope() as scope: + scope.do(report_load(pipe), volatile=True) + scope.do(perform_load(pipe, 0, 512)) + scope.do(perform_load(pipe, 4, 1024)) + scope.do(perform_load(pipe, 6, 128)) + scope.do(perform_load(pipe, 12, 1024)) + + run(main()) From 0bef0ef379e299aa40e68487421dcc0b931033c9 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 7 Feb 2020 16:23:14 +0100 Subject: [PATCH 491/648] archive: old scheduler --- lapis/scheduler_old.py | 176 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 lapis/scheduler_old.py diff --git a/lapis/scheduler_old.py b/lapis/scheduler_old.py new file mode 100644 index 0000000..22bbe60 --- /dev/null +++ b/lapis/scheduler_old.py @@ -0,0 +1,176 @@ +from typing import Dict +from usim import Scope, interval, Resources, time + +from lapis.drone import Drone +from lapis.monitor import sampling_required + + +class JobQueue(list): + pass + + +class CondorJobScheduler(object): + """ + Goal of the htcondor job scheduler is to have a scheduler that somehow + mimics how htcondor does schedule jobs. + Htcondor does scheduling based on a priority queue. The priorities itself + are managed by operators of htcondor. + So different instances can apparently behave very different. + + In my case I am going to try building a priority queue that sorts job slots + by increasing cost. The cost itself is calculated based on the current + strategy that is used at GridKa. The scheduler checks if a job either + exactly fits a slot or if it does fit into it several times. The cost for + putting a job at a given slot is given by the amount of resources that + might remain unallocated. 
+ :return: + """ + + def __init__(self, job_queue): + self._stream_queue = job_queue + self.drone_cluster = [] + self.interval = 60 + self.job_queue = JobQueue() + self._collecting = True + self._processing = Resources(jobs=0) + + @property + def drone_list(self): + for cluster in self.drone_cluster: + for drone in cluster: + yield drone + + def register_drone(self, drone: Drone): + self._add_drone(drone) + + def unregister_drone(self, drone: Drone): + for cluster in self.drone_cluster: + try: + cluster.remove(drone) + except ValueError: + pass + else: + if len(cluster) == 0: + self.drone_cluster.remove(cluster) + + def _add_drone(self, drone: Drone, drone_resources: Dict = None): + minimum_distance_cluster = None + distance = float("Inf") + if len(self.drone_cluster) > 0: + for cluster in self.drone_cluster: + current_distance = 0 + for key in {*cluster[0].pool_resources, *drone.pool_resources}: + if drone_resources: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone_resources.get(key, 0) + ) + else: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0) + ) + if current_distance < distance: + minimum_distance_cluster = cluster + distance = current_distance + if distance < 1: + minimum_distance_cluster.append(drone) + else: + self.drone_cluster.append([drone]) + else: + self.drone_cluster.append([drone]) + + def update_drone(self, drone: Drone): + self.unregister_drone(drone) + self._add_drone(drone) + + async def run(self): + async with Scope() as scope: + scope.do(self._collect_jobs()) + async for _ in interval(self.interval): + print("NEW SCHEDULING INTERVAL @ {}".format(time.now)) + print(self.job_queue) + for job in self.job_queue.copy(): + print("SCHEDULING {}".format(repr(job))) + best_match = self._schedule_job(job) + if best_match: + print( + "start job {} on drone {} @ {}".format( + repr(job), repr(best_match), time.now + ) + ) + await best_match.schedule_job(job) + self.job_queue.remove(job) + await sampling_required.put(self.job_queue) + self.unregister_drone(best_match) + left_resources = best_match.theoretical_available_resources + left_resources = { + key: value - job.resources.get(key, 0) + for key, value in left_resources.items() + } + self._add_drone(best_match, left_resources) + if ( + not self._collecting + and not self.job_queue + and self._processing.levels.jobs == 0 + ): + break + await sampling_required.put(self) + + async def _collect_jobs(self): + async for job in self._stream_queue: + self.job_queue.append(job) + await self._processing.increase(jobs=1) + # TODO: logging happens with each job + await sampling_required.put(self.job_queue) + self._collecting = False + + async def job_finished(self, job): + if job.successful: + await self._processing.decrease(jobs=1) + else: + await self._stream_queue.put(job) + + def _schedule_job(self, job) -> Drone: + priorities = {} + for cluster in self.drone_cluster: + drone = cluster[0] + cost = 0 + resources = drone.theoretical_available_resources + # print( + # "trying to match Job {} to {}, resources {}".format( + # repr(job), repr(drone), resources + # ) + # ) + for resource_type in job.resources: + if resources.get(resource_type, 0) < job.resources[resource_type]: + # Inf for all job resources that a drone does not support + # and all resources that are too small to even be considered + cost = float("Inf") + break + else: + try: + cost += 1 / ( + resources[resource_type] // 
job.resources[resource_type] + ) + except KeyError: + pass + for additional_resource_type in [ + key for key in drone.pool_resources if key not in job.resources + ]: + cost += resources[additional_resource_type] + cost /= len((*job.resources, *drone.pool_resources)) + if cost <= 1: + # directly start job + return drone + try: + priorities[cost].append(drone) + except KeyError: + priorities[cost] = [drone] + try: + minimal_key = min(priorities) + if minimal_key < float("Inf"): + return priorities[minimal_key][0] + except ValueError: + pass + return None From 3fb8def838b9d5903a259b01ea5eb5928bfa0faf Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 11 Feb 2020 21:12:09 +0100 Subject: [PATCH 492/648] added custom_simulate version for usage with batch system --- custom_simulate_batchsystem.py | 116 +++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 custom_simulate_batchsystem.py diff --git a/custom_simulate_batchsystem.py b/custom_simulate_batchsystem.py new file mode 100644 index 0000000..032d6b7 --- /dev/null +++ b/custom_simulate_batchsystem.py @@ -0,0 +1,116 @@ +from functools import partial + +import logging.handlers + +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter + +from lapis.job_io.htcondor import htcondor_job_reader +from lapis.pool import StaticPool +from lapis.pool_io.htcondor import htcondor_pool_reader +from lapis.job_io.swf import swf_job_reader +from lapis.storageelement import FileBasedHitrateStorage +from lapis.storage_io.storage import ( + storage_reader, + storage_reader_filebased_hitrate_caching, +) + +from lapis.scheduler import CondorJobScheduler +from lapis.simulator import Simulator + +import sys + +from lapis.monitor import LoggingUDPSocketHandler, SimulationTimeFilter + + +last_step = 0 + +job_import_mapper = {"htcondor": htcondor_job_reader, "swf": swf_job_reader} + +pool_import_mapper = {"htcondor": htcondor_pool_reader} + +storage_import_mapper = { + "standard": storage_reader, + "filehitrate": storage_reader_filebased_hitrate_caching, +} + + +def ini_and_run( + job_file, + pool_files, + storage_file, + storage_type, + log_file="test.log", + remote_throughput=1.0, + seed=1234, + until=None, + calculation_efficiency=1.0, + log_telegraf=False, +): + logging.getLogger("implementation").info( + job_file, pool_files, storage_file, log_file + ) + # ini logging to file + monitoring_logger = logging.getLogger() + monitoring_logger.setLevel(logging.DEBUG) + time_filter = SimulationTimeFilter() + monitoring_logger.addFilter(time_filter) + streamHandler = logging.StreamHandler(stream=open(log_file, "w")) + streamHandler.setFormatter(JsonFormatter()) + monitoring_logger.addHandler(streamHandler) + + if log_telegraf: + telegrafHandler = LoggingUDPSocketHandler( + "localhost", logging.handlers.DEFAULT_UDP_LOGGING_PORT + ) + telegrafHandler.setFormatter(LineProtocolFormatter(resolution=1)) + monitoring_logger.addHandler(telegrafHandler) + + # ini simulation + print("starting static environment") + simulator = Simulator(seed=seed) + file_type = "htcondor" + file = job_file + # print() + # input() + simulator.create_job_generator( + job_input=open(file, "r"), + job_reader=partial( + job_import_mapper[file_type], calculation_efficiency=calculation_efficiency + ), + ) + simulator.create_scheduler(scheduler_type=CondorJobScheduler) + + simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) + with open(storage_file, "r") as storage_file: + 
simulator.create_storage( + storage_input=storage_file, + storage_content_input=None, + storage_reader=storage_import_mapper[storage_type], + storage_type=FileBasedHitrateStorage, + ) + + for pool_file in pool_files: + with open(pool_file, "r") as pool_file: + pool_file_type = "htcondor" + simulator.create_pools( + pool_input=pool_file, + pool_reader=pool_import_mapper[pool_file_type], + pool_type=StaticPool, + ) + simulator.enable_monitoring() + + # run simulation + simulator.run(until=until) + + +ini_and_run( + job_file=sys.argv[1], + pool_files=[sys.argv[2], sys.argv[3]], + storage_file=sys.argv[4], + storage_type="filehitrate", + log_file=sys.argv[5], + remote_throughput=sys.argv[6], + calculation_efficiency=sys.argv[7], + log_telegraf=False, +) From f42749e98fb1bb73f2226e7634331ad90861267b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 11 Feb 2020 21:13:04 +0100 Subject: [PATCH 493/648] fixed definition of a jobs total input data to work with jobs without input data --- lapis/job.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 339c64e..e94d05e 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -81,9 +81,12 @@ def __init__( self.used_inputfiles = used_resources.pop("inputfiles", None) self._coordinated = 0 self._used_cache = 0 - self._total_input_data = sum( - [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] - ) + try: + self._total_input_data = sum( + [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] + ) + except AttributeError: + self._total_input_data = 0 @property def name(self) -> str: From b9a24684ab771d54b6a87bbed7ac73d259d24657 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 11 Feb 2020 21:14:00 +0100 Subject: [PATCH 494/648] fixed bug in job import --- lapis/job_io/htcondor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index cb50b1d..d57920b 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -81,6 +81,7 @@ def htcondor_job_reader( try: if not entry["Inputfiles"]: del entry["Inputfiles"] + raise KeyError resources["inputfiles"] = deepcopy(entry["Inputfiles"]) used_resources["inputfiles"] = deepcopy(entry["Inputfiles"]) for filename, filespecs in entry["Inputfiles"].items(): From f56de6d0e8d61b9e5e98f6bfd83f97ca9efd6e0f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 11 Feb 2020 21:55:55 +0100 Subject: [PATCH 495/648] fixed typing of bash variables --- custom_simulate_batchsystem.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/custom_simulate_batchsystem.py b/custom_simulate_batchsystem.py index 032d6b7..fc98a56 100644 --- a/custom_simulate_batchsystem.py +++ b/custom_simulate_batchsystem.py @@ -47,9 +47,6 @@ def ini_and_run( calculation_efficiency=1.0, log_telegraf=False, ): - logging.getLogger("implementation").info( - job_file, pool_files, storage_file, log_file - ) # ini logging to file monitoring_logger = logging.getLogger() monitoring_logger.setLevel(logging.DEBUG) @@ -110,7 +107,7 @@ def ini_and_run( storage_file=sys.argv[4], storage_type="filehitrate", log_file=sys.argv[5], - remote_throughput=sys.argv[6], - calculation_efficiency=sys.argv[7], + remote_throughput=float(sys.argv[6]), + calculation_efficiency=float(sys.argv[7]), log_telegraf=False, ) From 1ec0b50e1d9a5d3a59d7df15404317f89f82b971 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 12 Feb 2020 14:12:13 +0100 Subject: [PATCH 496/648] reduced debug output --- lapis/connection.py | 
16 +++++------ lapis/drone.py | 10 +++---- lapis/job.py | 28 +++++++++---------- lapis/scheduler.py | 1 - lapis/storageelement.py | 62 ++++++++++++++++++++--------------------- 5 files changed, 58 insertions(+), 59 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 56c7664..8b4e015 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -43,9 +43,9 @@ async def report_load_to_monitoring(pipe: MonitoredPipe): len(pipe._subscriptions), ) ) - print( - f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" - ) + # print( + # f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" + # ) async with Scope() as scope: scope.do(report_load_to_monitoring(self.remote_connection.connection)) @@ -165,11 +165,11 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): ] ) / sum([file["usedsize"] for file in requested_files.values()]) provides_file = int(random.random() < hitrate) - print( - "{} on {} hitrate {} => {}".format( - requested_files, drone.sitename, hitrate, provides_file - ) - ) + # print( + # "{} on {} hitrate {} => {}".format( + # requested_files, drone.sitename, hitrate, provides_file + # ) + # ) for inputfilename, inputfilespecs in requested_files.items(): if "hitrates" in inputfilespecs.keys(): diff --git a/lapis/drone.py b/lapis/drone.py index e802de2..c3e07d7 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -162,11 +162,11 @@ async def _run_job(self, job: Job, kill: bool): pass # self.scheduler.update_drone(self) await job_execution.done - print( - "finished job {} on drone {} @ {}".format( - repr(job), repr(self), time.now - ) - ) + # print( + # "finished job {} on drone {} @ {}".format( + # repr(job), repr(self), time.now + # ) + # ) except ResourcesUnavailable: await instant job_execution.cancel() diff --git a/lapis/job.py b/lapis/job.py index e94d05e..c656d52 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -115,11 +115,11 @@ async def _calculate(self): :param calculation_efficiency: :return: """ - print( - f"WALLTIME: Job {self} @ {time.now}, " - f"{self.used_resources.get('cores', None)}, " - f"{self.calculation_efficiency}" - ) + # print( + # f"WALLTIME: Job {self} @ {time.now}, " + # f"{self.used_resources.get('cores', None)}, " + # f"{self.calculation_efficiency}" + # ) result = self.walltime try: result = ( @@ -127,23 +127,23 @@ async def _calculate(self): ) * self.walltime except (KeyError, TypeError): pass - start = time.now + # start = time.now await (time + result) - print(f"finished calculation at {time.now - start}") + # print(f"finished calculation at {time.now - start}") async def _transfer_inputfiles(self): try: - start = time.now - print(f"TRANSFERING INPUTFILES: Job {self} @ {start}") + # start = time.now + # print(f"TRANSFERING INPUTFILES: Job {self} @ {start}") await self.drone.connection.transfer_files( drone=self.drone, requested_files=self.used_inputfiles, job_repr=repr(self), ) - print( - f"streamed inputfiles {self.used_inputfiles.keys()} for job {self} " - f"in {time.now - start} timeunits, finished @ {time.now}" - ) + # print( + # f"streamed inputfiles {self.used_inputfiles.keys()} for job {self} " + # f"in {time.now - start} timeunits, finished @ {time.now}" + # ) except AttributeError: pass @@ -153,7 +153,7 @@ async def run(self, drone: "Drone"): self.in_queue_until = time.now self._success = None await sampling_required.put(self) - print("running job {} in drone {}".format(repr(self), repr(self.drone))) + # print("running job {} in drone 
{}".format(repr(self), repr(self.drone))) try: start = time.now async with Scope() as scope: diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 48991c5..972657c 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -257,7 +257,6 @@ async def _collect_jobs(self): self.job_queue.append(wrapped_job) cluster_key = self._clustering_key(job.resources) self.job_cluster.setdefault(cluster_key, []).append(wrapped_job) - print(self.job_cluster) await self._processing.increase(jobs=1) # TODO: logging happens with each job # TODO: job queue to the outside now contains wrapped classads... diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 55bf6e1..0d3914d 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -166,21 +166,21 @@ def find(self, requested_file: RequestedFile, job_repr=None): :param job_repr: Needed for debug output, will be replaced :return: (amount of cached data, storage object) """ - print( - "LOOK UP FILE: Job {}, File {}, Storage {} @ {}".format( - job_repr, requested_file.filename, repr(self), time.now - ) - ) + # print( + # "LOOK UP FILE: Job {}, File {}, Storage {} @ {}".format( + # job_repr, requested_file.filename, repr(self), time.now + # ) + # ) try: result = LookUpInformation( self.files[requested_file.filename].filesize, self ) except KeyError: - print( - "File {} not cached on any reachable storage".format( - requested_file.filename - ) - ) + # print( + # "File {} not cached on any reachable storage".format( + # requested_file.filename + # ) + # ) result = LookUpInformation(0, self) return result @@ -216,16 +216,16 @@ def used(self): return 0 async def transfer(self, file: RequestedFile, job_repr=None): - print( - "TRANSFER: {}, filesize {}, remote: {}/{}, cache: {}/{}".format( - self._hitrate, - file.filesize, - (1 - self._hitrate) * file.filesize, - self.remote_storage.connection.throughput, - self._hitrate * file.filesize, - self.connection.throughput, - ) - ) + # print( + # "TRANSFER: {}, filesize {}, remote: {}/{}, cache: {}/{}".format( + # self._hitrate, + # file.filesize, + # (1 - self._hitrate) * file.filesize, + # self.remote_storage.connection.throughput, + # self._hitrate * file.filesize, + # self.connection.throughput, + # ) + # ) async with Scope() as scope: logging.getLogger("implementation").warning( "{} {} @ {} in {}".format( @@ -278,17 +278,17 @@ def used(self): return 0 async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): - print( - "TRANSFER: on {} with {}, filesize {}, remote: {}/{}, cache: {}/{}".format( - self.name, - file.cachehitrate, - file.filesize, - (1 - file.cachehitrate) * file.filesize, - self.remote_storage.connection.throughput, - file.cachehitrate * file.filesize, - self.connection.throughput, - ) - ) + # print( + # "TRANSFER: on {} with {}, filesize {}, remote: {}/{}, cache: {}/{}".format( + # self.name, + # file.cachehitrate, + # file.filesize, + # (1 - file.cachehitrate) * file.filesize, + # self.remote_storage.connection.throughput, + # file.cachehitrate * file.filesize, + # self.connection.throughput, + # ) + # ) if file.cachehitrate: await self.connection.transfer(total=file.filesize) else: From 04950f0aa14800209e30c513c7c91211637585a5 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 12 Feb 2020 20:23:54 +0100 Subject: [PATCH 497/648] fixed ZeroDivisionError in calculation of job input hitrate --- lapis/connection.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 
8b4e015..2a604c3 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -135,7 +135,7 @@ async def stream_file( ) except KeyError: pass - print(f"now transfering {requested_file.filesize} from {used_connection}") + # print(f"now transfering {requested_file.filesize} from {used_connection}") await used_connection.transfer(requested_file, job_repr=job_repr) # print( # "Job {}: finished transfering of file {}: {}B @ {}".format( @@ -158,13 +158,16 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): # decision if a jobs inputfiles are cached based on hitrate random_inputfile_information = next(iter(requested_files.values())) if "hitrates" in random_inputfile_information.keys(): - hitrate = sum( - [ - file["usedsize"] * file["hitrates"].get(drone.sitename, 0.0) - for file in requested_files.values() - ] - ) / sum([file["usedsize"] for file in requested_files.values()]) - provides_file = int(random.random() < hitrate) + try: + hitrate = sum( + [ + file["usedsize"] * file["hitrates"].get(drone.sitename, 0.0) + for file in requested_files.values() + ] + ) / sum([file["usedsize"] for file in requested_files.values()]) + provides_file = int(random.random() < hitrate) + except ZeroDivisionError: + provides_file = 0 # print( # "{} on {} hitrate {} => {}".format( # requested_files, drone.sitename, hitrate, provides_file From de0a9e0fb9ca9885a64dcaad9e443a1e2bdb5a42 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 12 Feb 2020 20:25:19 +0100 Subject: [PATCH 498/648] hot fix to avoid changing the walltime of jobs without inputfiles --- lapis/job.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index c656d52..c8fde13 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -122,6 +122,8 @@ async def _calculate(self): # ) result = self.walltime try: + if not self.requested_inputfiles: + raise KeyError result = ( self.used_resources["cores"] / self.calculation_efficiency ) * self.walltime @@ -170,9 +172,9 @@ async def run(self, drone: "Drone"): # TODO: in_queue_until is still set raise else: - old_walltime = self.walltime + # old_walltime = self.walltime self.walltime = time.now - start - print(f"monitored walltime of {old_walltime} changed to {self.walltime}") + # print(f"monitored walltime of {old_walltime} changed to {self.walltime}") self.drone = None self._success = True await sampling_required.put(self) From f00d6e05ba06f362e463b308f44b97ef8313a382 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 31 Jan 2020 14:13:50 +0100 Subject: [PATCH 499/648] added base class for job scheduler --- lapis/scheduler.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 972657c..c2e155f 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,3 +1,4 @@ +from abc import ABC from typing import Dict, Iterator, Union, Tuple, List, TypeVar, Generic, Optional from weakref import WeakKeyDictionary @@ -72,7 +73,30 @@ class Bucket(List[Cluster[DJ]], Generic[DJ]): pass -class CondorJobScheduler(object): +class JobScheduler(ABC): + __slots__ = () + + @property + def drone_list(self) -> Iterator[Drone]: + raise NotImplementedError + + def register_drone(self, drone: Drone): + raise NotImplementedError + + def unregister_drone(self, drone: Drone): + raise NotImplementedError + + def update_drone(self, drone: Drone): + raise NotImplementedError + + async def run(self): + raise NotImplementedError + + async def job_finished(self, job): + raise 
NotImplementedError + + +class CondorJobScheduler(JobScheduler): """ Goal of the htcondor job scheduler is to have a scheduler that somehow mimics how htcondor does schedule jobs. From 54a27be74dabbfe7425bd453493840bdf8b5c4f4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 31 Jan 2020 14:51:28 +0100 Subject: [PATCH 500/648] added older scheduler and renamed classad scheduler to CondorClassadJobScheduler --- lapis/scheduler.py | 155 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index c2e155f..a0e1824 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -96,7 +96,160 @@ async def job_finished(self, job): raise NotImplementedError -class CondorJobScheduler(JobScheduler): +class CondorJobScheduler(object): + """ + Goal of the htcondor job scheduler is to have a scheduler that somehow + mimics how htcondor does schedule jobs. + Htcondor does scheduling based on a priority queue. The priorities itself + are managed by operators of htcondor. + So different instances can apparently behave very different. + In my case I am going to try building a priority queue that sorts job slots + by increasing cost. The cost itself is calculated based on the current + strategy that is used at GridKa. The scheduler checks if a job either + exactly fits a slot or if it does fit into it several times. The cost for + putting a job at a given slot is given by the amount of resources that + might remain unallocated. + :return: + """ + + def __init__(self, job_queue): + self._stream_queue = job_queue + self.drone_cluster = [] + self.interval = 60 + self.job_queue = JobQueue() + self._collecting = True + self._processing = Resources(jobs=0) + + @property + def drone_list(self): + for cluster in self.drone_cluster: + for drone in cluster: + yield drone + + def register_drone(self, drone: Drone): + self._add_drone(drone) + + def unregister_drone(self, drone: Drone): + for cluster in self.drone_cluster: + try: + cluster.remove(drone) + except ValueError: + pass + else: + if len(cluster) == 0: + self.drone_cluster.remove(cluster) + + def _add_drone(self, drone: Drone, drone_resources: Dict = None): + minimum_distance_cluster = None + distance = float("Inf") + if len(self.drone_cluster) > 0: + for cluster in self.drone_cluster: + current_distance = 0 + for key in {*cluster[0].pool_resources, *drone.pool_resources}: + if drone_resources: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone_resources.get(key, 0) + ) + else: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0) + ) + if current_distance < distance: + minimum_distance_cluster = cluster + distance = current_distance + if distance < 1: + minimum_distance_cluster.append(drone) + else: + self.drone_cluster.append([drone]) + else: + self.drone_cluster.append([drone]) + + def update_drone(self, drone: Drone): + self.unregister_drone(drone) + self._add_drone(drone) + + async def run(self): + async with Scope() as scope: + scope.do(self._collect_jobs()) + async for _ in interval(self.interval): + for job in self.job_queue.copy(): + best_match = self._schedule_job(job) + if best_match: + await best_match.schedule_job(job) + self.job_queue.remove(job) + await sampling_required.put(self.job_queue) + self.unregister_drone(best_match) + left_resources = best_match.theoretical_available_resources + left_resources = { + key: value - 
job.resources.get(key, 0) + for key, value in left_resources.items() + } + self._add_drone(best_match, left_resources) + if ( + not self._collecting + and not self.job_queue + and self._processing.levels.jobs == 0 + ): + break + await sampling_required.put(self) + + async def _collect_jobs(self): + async for job in self._stream_queue: + self.job_queue.append(job) + await self._processing.increase(jobs=1) + # TODO: logging happens with each job + await sampling_required.put(self.job_queue) + self._collecting = False + + async def job_finished(self, job): + if job.successful: + await self._processing.decrease(jobs=1) + else: + await self._stream_queue.put(job) + + def _schedule_job(self, job) -> Drone: + priorities = {} + for cluster in self.drone_cluster: + drone = cluster[0] + cost = 0 + resources = drone.theoretical_available_resources + for resource_type in job.resources: + if resources.get(resource_type, 0) < job.resources[resource_type]: + # Inf for all job resources that a drone does not support + # and all resources that are too small to even be considered + cost = float("Inf") + break + else: + try: + cost += 1 / ( + resources[resource_type] // job.resources[resource_type] + ) + except KeyError: + pass + for additional_resource_type in [ + key for key in drone.pool_resources if key not in job.resources + ]: + cost += resources[additional_resource_type] + cost /= len((*job.resources, *drone.pool_resources)) + if cost <= 1: + # directly start job + return drone + try: + priorities[cost].append(drone) + except KeyError: + priorities[cost] = [drone] + try: + minimal_key = min(priorities) + if minimal_key < float("Inf"): + return priorities[minimal_key][0] + except ValueError: + pass + return None + + +class CondorClassadJobScheduler(JobScheduler): """ Goal of the htcondor job scheduler is to have a scheduler that somehow mimics how htcondor does schedule jobs. From 846fe558070138e2090f1b3c36626e785fee31b1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 31 Jan 2020 15:02:29 +0100 Subject: [PATCH 501/648] CondorJobScheduler now implements interface of JobScheduler --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index a0e1824..9c36a9d 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -96,7 +96,7 @@ async def job_finished(self, job): raise NotImplementedError -class CondorJobScheduler(object): +class CondorJobScheduler(JobScheduler): """ Goal of the htcondor job scheduler is to have a scheduler that somehow mimics how htcondor does schedule jobs. 
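The cost heuristic restored for CondorJobScheduler._schedule_job in [PATCH 500/648] can be read as follows: a drone that cannot satisfy every requested resource is priced at infinity, a drone into which the request fits only a few times per resource is cheap, and any capacity the job does not request at all is counted as potential waste; the sum is normalised over all considered resource types, and a cost of at most 1 triggers an immediate match, while everything else is collected in the priorities mapping. The sketch below condenses that heuristic into a standalone function for illustration; the resource names and numbers in the example are made up and not taken from any LAPIS input file.

# Condensed, standalone sketch of the cost heuristic from
# CondorJobScheduler._schedule_job (PATCH 500/648). Illustrative values only.
def match_cost(job_resources, available, pool_resources):
    cost = 0.0
    for resource_type, requested in job_resources.items():
        if available.get(resource_type, 0) < requested:
            # drone cannot host the job at all
            return float("inf")
        # the more often the request fits into the free share, the cheaper
        cost += 1 / (available[resource_type] // requested)
    for resource_type in pool_resources:
        if resource_type not in job_resources:
            # capacity the job does not ask for counts as potential waste
            cost += available[resource_type]
    # normalise over all considered resource entries
    return cost / (len(job_resources) + len(pool_resources))


if __name__ == "__main__":
    job = {"cores": 2, "memory": 4}
    drone_available = {"cores": 2, "memory": 4, "disk": 0}
    drone_pool = {"cores": 8, "memory": 16, "disk": 100}
    print(match_cost(job, drone_available, drone_pool))

Running this example yields (1/1 + 1/1 + 0) / 5 = 0.4, i.e. an exact fit with no idle disk, which the scheduler in the patch would start immediately; a drone with a large amount of unused disk would instead end up with a cost above 1 and only be considered via the sorted priorities mapping.
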
From c5f01b674cb6ebef39983198b356158e0357eeca Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 31 Jan 2020 16:05:02 +0100 Subject: [PATCH 502/648] added possibility to save temporary resources at drone wrapper --- lapis/scheduler.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 9c36a9d..9544446 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -28,12 +28,13 @@ class JobQueue(list): class WrappedClassAd(ClassAd, Generic[DJ]): - __slots__ = "_wrapped" + __slots__ = "_wrapped", "_temp" def __init__(self, classad: ClassAd, wrapped: DJ): super(WrappedClassAd, self).__init__() self._wrapped = wrapped self._data = classad._data + self._temp = {} def __getitem__(self, item): def access_wrapped(name, requested=True): @@ -51,13 +52,25 @@ def access_wrapped(name, requested=True): elif "requestdisk" in item: return (1 / 1024) * access_wrapped("disk", requested=True) elif "cpus" in item: - return access_wrapped("cores", requested=False) + try: + return self._temp["cores"] + except KeyError: + return access_wrapped("cores", requested=False) elif "memory" in item: - return (1 / 1000 / 1000) * access_wrapped("memory", requested=False) + try: + return self._temp["memory"] + except KeyError: + return (1 / 1000 / 1000) * access_wrapped("memory", requested=False) elif "disk" in item: - return (1 / 1024) * access_wrapped("disk", requested=False) + try: + return self._temp["disk"] + except KeyError: + return (1 / 1024) * access_wrapped("disk", requested=False) return super(WrappedClassAd, self).__getitem__(item) + def clear_temporary_resources(self): + self._temp.clear() + def __repr__(self): return f"<{self.__class__.__name__}>: {self._wrapped}" From bacaf0ce0d46b9b36695e18ac37a186c0bb2c795 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 31 Jan 2020 16:50:42 +0100 Subject: [PATCH 503/648] added docstrings to scheduler base class --- lapis/scheduler.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 9544446..f12cc6d 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -91,21 +91,31 @@ class JobScheduler(ABC): @property def drone_list(self) -> Iterator[Drone]: + """Yields the registered drones""" raise NotImplementedError def register_drone(self, drone: Drone): + """Register a drone at the scheduler""" raise NotImplementedError def unregister_drone(self, drone: Drone): + """Unregister a drone at the scheduler""" raise NotImplementedError def update_drone(self, drone: Drone): + """Update parameters of a drone""" raise NotImplementedError async def run(self): + """Run method of the scheduler""" raise NotImplementedError async def job_finished(self, job): + """ + Declare a job as finished by a drone. This might even mean, that the job + has failed and that the scheduler needs to requeue the job for further + processing. 
+ """ raise NotImplementedError From 72a4be12ff326d0e26d420a288d2516f2b2d09a3 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Wed, 12 Feb 2020 12:42:35 +0100 Subject: [PATCH 504/648] condor classad scheduler draft --- lapis/scheduler.py | 308 ++++++++++++++++++++++++++------------------- 1 file changed, 177 insertions(+), 131 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index f12cc6d..0552994 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,8 +1,11 @@ from abc import ABC -from typing import Dict, Iterator, Union, Tuple, List, TypeVar, Generic, Optional +from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple from weakref import WeakKeyDictionary +from sortedcontainers import SortedDict + from classad import parse +from classad._base_expression import Expression from classad._functions import quantize from classad._primitives import HTCInt, Undefined from classad._expression import ClassAd @@ -23,6 +26,15 @@ class JobQueue(list): "cores": HTCInt(1), } +machine_ad_defaults = """ +requirements = target.requestcpus <= my.cpus +""".strip() + +job_ad_defaults = """ +requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory +""" + +T = TypeVar("T") DJ = TypeVar("DJ", Drone, Job) @@ -272,6 +284,89 @@ def _schedule_job(self, job) -> Drone: return None +# HTCondor ClassAd Scheduler + + +class NoMatch(Exception): + """A job could not be matched to any drone""" + + +class RankedClusterKey(NamedTuple): + rank: float + key: Tuple[float, ...] + + +class RankedAutoClusters(Generic[DJ]): + """Automatically cluster similar jobs or drones""" + + def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): + self._quantization = quantization + self._ranking = ranking + self._clusters: Dict[RankedClusterKey, Set[WrappedClassAd[DJ]]] = SortedDict() + self._inverse: Dict[WrappedClassAd[DJ], RankedClusterKey] = {} + + def copy(self) -> "RankedAutoClusters[DJ]": + """Copy the entire ranked auto clusters""" + clone = type(self)(quantization=self._quantization, ranking=self._ranking) + clone._clusters = SortedDict( + (key, value.copy()) for key, value in self._clusters.items() + ) + clone._inverse = self._inverse.copy() + return clone + + def add(self, item: WrappedClassAd[DJ]): + """Add a new item""" + if item in self._inverse: + raise ValueError(f"{item!r} already stored; use `.update(item)` instead") + item_key = self._clustering_key(item) + try: + self._clusters[item_key].add(item) + except KeyError: + self._clusters[item_key] = {item} + self._inverse[item] = item_key + + def remove(self, item: WrappedClassAd[DJ]): + """Remove an existing item""" + item_key = self._inverse.pop(item) + cluster = self._clusters[item_key] + cluster.remove(item) + if not cluster: + del self._clusters[item_key] + + def update(self, item): + """Update an existing item with its current state""" + self.remove(item) + self.add(item) + + def _clustering_key(self, item: WrappedClassAd[DJ]): + # TODO: assert that order is consistent + quantization = self._quantization + return RankedClusterKey( + rank=self._ranking.evaluate(my=item), + key=tuple( + int(quantize(value, quantization.get(key, 1))) + for key, value in item._wrapped.available_resources.items() + ), + ) + + def clusters(self) -> Iterator[Set[WrappedClassAd[DJ]]]: + return iter(self._clusters.values()) + + def items(self) -> Iterator[Tuple[RankedClusterKey, Set[WrappedClassAd[DJ]]]]: + return iter(self._clusters.items()) + + def cluster_groups(self) -> 
Iterator[List[Set[WrappedClassAd[Drone]]]]: + """Group autoclusters by PreJobRank""" + group = [] + current_rank = 0 + for ranked_key, drones in self._clusters.items(): + if ranked_key.rank != current_rank and group: + current_rank = ranked_key.rank + yield group + group = [] + group.append(drones) + + class CondorClassadJobScheduler(JobScheduler): """ Goal of the htcondor job scheduler is to have a scheduler that somehow @@ -288,175 +383,130 @@ class CondorClassadJobScheduler(JobScheduler): :return: """ - def __init__(self, job_queue): + def __init__( + self, + job_queue, + machine_ad: str = machine_ad_defaults, + job_ad: str = job_ad_defaults, + pre_job_rank: str = "0", + interval: float = 60, + ): self._stream_queue = job_queue - self.drone_cluster: Dict[Tuple[float, ...], Cluster[WrappedClassAd[Drone]]] = {} - self.job_cluster: Dict[Tuple[float, ...], Cluster[WrappedClassAd[Job]]] = {} - self.interval = 60 + self._drones: RankedAutoClusters[Drone] = RankedAutoClusters( + quantization=quantization_defaults, ranking=parse(pre_job_rank) + ) + self.interval = interval self.job_queue = JobQueue() self._collecting = True self._processing = Resources(jobs=0) # temporary solution self._wrapped_classads = WeakKeyDictionary() - self._machine_classad = parse( - """ - requirements = target.requestcpus <= my.cpus - pre_job_rank = 1 - rank = 0 - - """ - ) - self._job_classad = parse( - """ - requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory - """ - ) + self._machine_classad = parse(machine_ad) + self._job_classad = parse(job_ad) @property def drone_list(self) -> Iterator[Drone]: - for cluster in self.drone_cluster.values(): + for cluster in self._drones.clusters(): for drone in cluster: yield drone._wrapped def register_drone(self, drone: Drone): wrapped_drone = WrappedClassAd(classad=self._machine_classad, wrapped=drone) self._wrapped_classads[drone] = wrapped_drone - self._add_drone(wrapped_drone) + self._drones.add(wrapped_drone) def unregister_drone(self, drone: Drone): drone_wrapper = self._wrapped_classads[drone] - for key in self.drone_cluster: - try: - self.drone_cluster[key].remove(drone_wrapper) - except ValueError: - pass - else: - break - else: - # nothing was removed - return - if len(self.drone_cluster[key]) == 0: - del self.drone_cluster[key] - - @staticmethod - def _clustering_key(resource_dict: Dict): - clustering_key = [] - for key, value in resource_dict.items(): - clustering_key.append( - int(quantize(value, quantization_defaults.get(key, 1))) - ) - return tuple(clustering_key) - - def _add_drone(self, drone: WrappedClassAd, drone_resources: Dict = None): - wrapped_drone = drone._wrapped - if drone_resources: - clustering_key = self._clustering_key(drone_resources) - else: - clustering_key = self._clustering_key(wrapped_drone.available_resources) - self.drone_cluster.setdefault(clustering_key, Cluster()).append(drone) + self._drones.remove(drone_wrapper) def update_drone(self, drone: Drone): - self.unregister_drone(drone) - self._add_drone(self._wrapped_classads[drone]) - - def _sort_drone_cluster(self): - return [Bucket(self.drone_cluster.values())] - - def _sort_job_cluster(self): - return Bucket(self.job_cluster.values()) + drone_wrapper = self._wrapped_classads[drone] + self._drones.update(drone_wrapper) async def run(self): - def filter_drones(job: WrappedClassAd[Job], drone_bucket: Bucket[Drone]): - result = {} # type: Dict[Union[Undefined, float], Bucket[Drone]] - for drones in drone_bucket: - drone = drones[0] # type: WrappedClassAd[Drone] 
- if job.evaluate( - "requirements", my=job, target=drone - ) and drone.evaluate("requirements", my=drone, target=job): - rank = drone.evaluate("rank", my=job, target=drone) - result.setdefault(rank, Bucket()).append(drones) - return result - - def pop_first( - ranked_drones: Dict[Union[Undefined, float], Bucket[Drone]] - ) -> Optional[WrappedClassAd[Drone]]: - if not ranked_drones: - return None - # print(ranked_drones) - key = sorted(ranked_drones)[0] - values = ranked_drones[key] - # print(key, values) - result = values[0] - values.remove(result) - if not values: - del ranked_drones[key] - try: - return result[0] - except IndexError: - return pop_first(ranked_drones) - async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): - - # TODO: get sorted job cluster [{Job, ...}, ...] - # TODO: get set of drone cluster {{PSlot, ...}, ...} - # TODO: get sorted drone clusters PreJob [{{PSlot, ...}, ...}, ...] - # TODO: filter (Job.Requirements) and sort (Job.Rank) for job and drones => lazy - - all_drone_buckets = self._sort_drone_cluster() - filtered_drones = {} - for jobs in self._sort_job_cluster().copy(): - current_drone_bucket = 0 - for job in jobs: - best_match = pop_first(filtered_drones) - while best_match is None: - # lazily evaluate more PSlots - try: - # TODO: sort filtered_drones - filtered_drones = filter_drones( - job, all_drone_buckets[current_drone_bucket] - ) - except IndexError: - break - current_drone_bucket += 1 - best_match = pop_first(filtered_drones) - else: - # TODO: update drone and check if it gets reinserted to filtered_drones - await self._execute_job(job=job, drone=best_match) + await self._schedule_jobs() if ( not self._collecting and not self.job_queue and self._processing.levels.jobs == 0 ): break - await sampling_required.put(self) + + @staticmethod + def _match_job( + job: ClassAd, pre_job_clusters: Iterator[List[Set[WrappedClassAd[Drone]]]] + ): + if job["Requirements"] != Undefined: + pre_job_clusters = ( + [ + cluster + for cluster in cluster_group + if job.evaluate("Requirements", my=job, target=next(iter(cluster))) + ] + for cluster_group in pre_job_clusters + ) + if job["Rank"] != Undefined: + pre_job_clusters = ( + sorted( + cluster_group, + key=lambda cluster: job.evaluate( + "Rank", my=job, target=next(iter(cluster)) + ), + ) + for cluster_group in pre_job_clusters + ) + for cluster_group in pre_job_clusters: + # TODO: if we have POST_JOB_RANK, collect *all* matches of a group + for cluster in cluster_group: + for drone in cluster: + if drone["Requirements"] == Undefined or drone.evaluate( + "Requirements", my=drone, target=job + ): + return drone + raise NoMatch() + + async def _schedule_jobs(self): + # Pre Job Rank is the same for all jobs + # Use a copy to allow temporary "remainder after match" estimates + pre_job_drones = self._drones.copy() + matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] + for queue_index, candidate_job in enumerate(self.job_queue): + try: + matched_drone = self._match_job( + candidate_job, pre_job_drones.cluster_groups() + ) + except NoMatch: + continue + else: + matches.append((queue_index, candidate_job, matched_drone)) + # TODO: deduct job-resources from matched drone + # and update instead of remove + pre_job_drones.remove(matched_drone) + if not matches: + return + # TODO: optimize for few matches, many matches, all matches + for queue_index, _, _ in reversed(matches): + del self.job_queue[queue_index] + for _, job, drone in matches: + await 
self._execute_job(job=job, drone=drone) + await sampling_required.put(self) + # NOTE: Is this correct? Triggers once instead of for each job + await sampling_required.put(self.job_queue) async def _execute_job(self, job: WrappedClassAd, drone: WrappedClassAd): wrapped_job = job._wrapped wrapped_drone = drone._wrapped await wrapped_drone.schedule_job(wrapped_job) - self.job_queue.remove(job) - cluster_key = self._clustering_key(wrapped_job.resources) - self.job_cluster[cluster_key].remove(job) - if len(self.job_cluster[cluster_key]) == 0: - del self.job_cluster[cluster_key] - await sampling_required.put(self.job_queue) - self.unregister_drone(wrapped_drone) - left_resources = { - key: value - wrapped_job.resources.get(key, 0) - for key, value in wrapped_drone.theoretical_available_resources.items() - } - self._add_drone(drone, left_resources) async def _collect_jobs(self): async for job in self._stream_queue: wrapped_job = WrappedClassAd(classad=self._job_classad, wrapped=job) self._wrapped_classads[job] = wrapped_job self.job_queue.append(wrapped_job) - cluster_key = self._clustering_key(job.resources) - self.job_cluster.setdefault(cluster_key, []).append(wrapped_job) await self._processing.increase(jobs=1) # TODO: logging happens with each job # TODO: job queue to the outside now contains wrapped classads... @@ -468,7 +518,3 @@ async def job_finished(self, job): await self._processing.decrease(jobs=1) else: self.job_queue.append(self._wrapped_classads[job]) - cluster_key = self._clustering_key(job.resources) - self.job_cluster.setdefault(cluster_key, []).append( - self._wrapped_classads[job] - ) From 1d3f5eb777231b36d427ac203780122e915d0215 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 12 Feb 2020 15:24:51 +0100 Subject: [PATCH 505/648] added hash for wrappedclassad --- lapis/scheduler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 0552994..d7170d9 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -89,6 +89,9 @@ def __repr__(self): def __eq__(self, other): return super().__eq__(other) and self._wrapped == other._wrapped + def __hash__(self): + return id(self._wrapped) + class Cluster(List[WrappedClassAd[DJ]], Generic[DJ]): pass From 737e6986de336093dd386f4102e3952e8baaeeb1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 12 Feb 2020 16:48:06 +0100 Subject: [PATCH 506/648] made scheduler working --- lapis/scheduler.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index d7170d9..cc8b8c1 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -361,13 +361,16 @@ def items(self) -> Iterator[Tuple[RankedClusterKey, Set[WrappedClassAd[DJ]]]]: def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: """Group autoclusters by PreJobRank""" group = [] - current_rank = 0 + current_rank = None for ranked_key, drones in self._clusters.items(): - if ranked_key.rank != current_rank and group: + if ranked_key.rank != current_rank: current_rank = ranked_key.rank - yield group - group = [] + if group: + yield group + group = [] group.append(drones) + if group: + yield group class CondorClassadJobScheduler(JobScheduler): @@ -443,7 +446,7 @@ async def run(self): def _match_job( job: ClassAd, pre_job_clusters: Iterator[List[Set[WrappedClassAd[Drone]]]] ): - if job["Requirements"] != Undefined: + if job["Requirements"] != Undefined(): pre_job_clusters = ( [ cluster @@ -452,7 +455,7 @@ def _match_job( ] for cluster_group in 
pre_job_clusters ) - if job["Rank"] != Undefined: + if job["Rank"] != Undefined(): pre_job_clusters = ( sorted( cluster_group, @@ -466,7 +469,7 @@ def _match_job( # TODO: if we have POST_JOB_RANK, collect *all* matches of a group for cluster in cluster_group: for drone in cluster: - if drone["Requirements"] == Undefined or drone.evaluate( + if drone["Requirements"] == Undefined() or drone.evaluate( "Requirements", my=drone, target=job ): return drone From b88814476e58cfaf17ad346ea64b40b455ce1316 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 12 Feb 2020 17:35:38 +0100 Subject: [PATCH 507/648] updating of available resources in drones, closes #82 --- lapis/scheduler.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index cc8b8c1..5bf0aa6 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -70,12 +70,12 @@ def access_wrapped(name, requested=True): return access_wrapped("cores", requested=False) elif "memory" in item: try: - return self._temp["memory"] + return (1 / 1000 / 1000) * self._temp["memory"] except KeyError: return (1 / 1000 / 1000) * access_wrapped("memory", requested=False) elif "disk" in item: try: - return self._temp["disk"] + return (1 / 1024) * self._temp["disk"] except KeyError: return (1 / 1024) * access_wrapped("disk", requested=False) return super(WrappedClassAd, self).__getitem__(item) @@ -489,15 +489,22 @@ async def _schedule_jobs(self): continue else: matches.append((queue_index, candidate_job, matched_drone)) - # TODO: deduct job-resources from matched drone - # and update instead of remove - pre_job_drones.remove(matched_drone) + for key, value in candidate_job._wrapped.resources.items(): + matched_drone._temp[key] = ( + matched_drone._temp.get( + key, + matched_drone._wrapped.theoretical_available_resources[key], + ) + - value + ) + pre_job_drones.update(matched_drone) if not matches: return # TODO: optimize for few matches, many matches, all matches for queue_index, _, _ in reversed(matches): del self.job_queue[queue_index] for _, job, drone in matches: + drone.clear_temporary_resources() await self._execute_job(job=job, drone=drone) await sampling_required.put(self) # NOTE: Is this correct? 
Triggers once instead of for each job From bd1677c266f7ab1215017a264f349e1601bcc484 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 12 Feb 2020 17:42:07 +0100 Subject: [PATCH 508/648] gardening --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 5bf0aa6..0cc6000 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -489,7 +489,7 @@ async def _schedule_jobs(self): continue else: matches.append((queue_index, candidate_job, matched_drone)) - for key, value in candidate_job._wrapped.resources.items(): + for key, value in enumerate(candidate_job._wrapped.resources): matched_drone._temp[key] = ( matched_drone._temp.get( key, From 8982f68430acca21979911358c2bcd1c717ef5bb Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 13 Feb 2020 15:17:09 +0100 Subject: [PATCH 509/648] added custom scheduler --- custom_simulate.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index 4ffade7..dc21ee8 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -15,7 +15,7 @@ storage_reader_filebased_hitrate_caching, ) -from lapis.scheduler import CondorJobScheduler +from lapis.scheduler import CondorClassadJobScheduler from lapis.simulator import Simulator @@ -23,6 +23,16 @@ from time import time +machine_ad_defaults = """ + requirements = target.requestcpus <= my.cpus + rank = 0 + """.strip() + +job_ad_defaults = """ +requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory +rank = 1 +""" +pre_job_rank_defaults = "0" last_step = 0 @@ -47,6 +57,7 @@ def ini_and_run( until=None, calculation_efficiency=1.0, log_telegraf=False, + pre_job_rank=pre_job_rank_defaults, ): # ini logging to file monitoring_logger = logging.getLogger() @@ -77,7 +88,13 @@ def ini_and_run( job_import_mapper[file_type], calculation_efficiency=calculation_efficiency ), ) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) + + simulator.job_scheduler = CondorClassadJobScheduler( + job_queue=simulator.job_queue, + pre_job_rank=pre_job_rank, + machine_ad=machine_ad_defaults, + job_ad=job_ad_defaults, + ) simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) with open(storage_file, "r") as storage_file: @@ -126,6 +143,8 @@ def ini_and_run( pool_files=pool_files, storage_file=storage_file, storage_type=storage_type, - log_file="minimal_hitratebased_test.log", + log_file="test_new_scheduler.log", log_telegraf=True, + # pre_job_rank="100000 * my.cpus + my.memory - 1000000 - 10000000 * my.rank " + pre_job_rank="1", ) From f6e28cc6249851eb4d014961d72af566b0c0d12f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 13 Feb 2020 12:24:23 +0100 Subject: [PATCH 510/648] reversed gardening, sorry --- lapis/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 0cc6000..5bf0aa6 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -489,7 +489,7 @@ async def _schedule_jobs(self): continue else: matches.append((queue_index, candidate_job, matched_drone)) - for key, value in enumerate(candidate_job._wrapped.resources): + for key, value in candidate_job._wrapped.resources.items(): matched_drone._temp[key] = ( matched_drone._temp.get( key, From 0b7af85849cc5d8c31df3540afa39e14f876acf1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 13 Feb 2020 14:54:18 +0100 Subject: [PATCH 511/648] fixed calculation of clustering key and reversed 
pre_job_cluster --- lapis/scheduler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 5bf0aa6..027c5a2 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -345,10 +345,10 @@ def _clustering_key(self, item: WrappedClassAd[DJ]): # TODO: assert that order is consistent quantization = self._quantization return RankedClusterKey( - rank=self._ranking.evaluate(my=item), + rank=-self._ranking.evaluate(my=item), key=tuple( - int(quantize(value, quantization.get(key, 1))) - for key, value in item._wrapped.available_resources.items() + int(quantize(item[key], quantization.get(key, 1))) + for key in ("cpus", "memory", "disk") ), ) @@ -462,6 +462,7 @@ def _match_job( key=lambda cluster: job.evaluate( "Rank", my=job, target=next(iter(cluster)) ), + reverse=True, ) for cluster_group in pre_job_clusters ) From 824ffd666cf0df8b2ff8c60795d68e2098e5e527 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 13 Feb 2020 15:06:05 +0100 Subject: [PATCH 512/648] shuffling cluster group to remove bias --- lapis/scheduler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 027c5a2..34d3642 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,3 +1,4 @@ +import random from abc import ABC from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple from weakref import WeakKeyDictionary @@ -468,6 +469,7 @@ def _match_job( ) for cluster_group in pre_job_clusters: # TODO: if we have POST_JOB_RANK, collect *all* matches of a group + random.shuffle(cluster_group) # shuffle cluster to remove bias towards cpus for cluster in cluster_group: for drone in cluster: if drone["Requirements"] == Undefined() or drone.evaluate( From ab56eec4beac04b5b8ccb255563ff5e3aa92ac86 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 14 Feb 2020 14:52:57 +0100 Subject: [PATCH 513/648] changed log file format to LineProtocol for influx --- lapis/monitor/caching.py | 10 ++++++++-- lapis/monitor/cobald.py | 10 ++++++++-- lapis/monitor/general.py | 37 ++++++++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 598b12b..b19c9a0 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -38,7 +38,10 @@ def storage_status(storage: StorageElement) -> list: storage_status.whitelist = (StorageElement,) storage_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "storage"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "storage"}, resolution=1 ), @@ -67,7 +70,10 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: pipe_status.whitelist = (MonitoredPipeInfo,) pipe_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pipe"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pipe"}, resolution=1 ), diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py index 21dfee0..710ee0c 100644 --- a/lapis/monitor/cobald.py +++ b/lapis/monitor/cobald.py @@ -35,7 +35,10 @@ def drone_statistics(drone: 
Drone) -> List[Dict]: drone_statistics.whitelist = (Drone,) drone_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 ), @@ -67,7 +70,10 @@ def pool_statistics(pool: Pool) -> List[Dict]: pool_statistics.whitelist = (Pool,) pool_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 ), diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index ab46b2b..8569e6d 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -58,7 +58,11 @@ def resource_statistics(drone: Drone) -> List[Dict]: resource_statistics.whitelist = (Drone,) resource_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, + resolution=1, + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, resolution=1, @@ -81,7 +85,10 @@ def user_demand(job_queue: JobQueue) -> List[Dict]: user_demand.whitelist = (JobQueue,) user_demand.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis"}, resolution=1 ), @@ -118,7 +125,10 @@ def job_statistics(scheduler: CondorJobScheduler) -> List[Dict]: job_statistics.whitelist = (CondorJobScheduler,) job_statistics.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 ), @@ -181,7 +191,10 @@ def job_events(job: Job) -> List[Dict]: job_events.whitelist = (Job,) job_events.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, resolution=1 ), @@ -202,7 +215,11 @@ def pool_status(pool: Pool) -> List[Dict]: 
pool_status.whitelist = (Pool,) pool_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, + resolution=1, + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, resolution=1, @@ -223,7 +240,10 @@ def configuration_information(simulator: "Simulator") -> List[Dict]: configuration_information.name = "configuration" configuration_information.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 ), @@ -255,7 +275,10 @@ def drone_statistics_caching(drone: Drone) -> List[Dict]: drone_statistics_caching.whitelist = (Drone,) drone_statistics_caching.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_type", "pool"}, resolution=1 + ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_type", "pool"}, resolution=1 ), From fcfac62c35dc2f437c63fb3621dea3974edb381a Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 14 Feb 2020 15:07:13 +0100 Subject: [PATCH 514/648] updated custom_simulate_batchsystem.py --- custom_simulate_batchsystem.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/custom_simulate_batchsystem.py b/custom_simulate_batchsystem.py index fc98a56..2786919 100644 --- a/custom_simulate_batchsystem.py +++ b/custom_simulate_batchsystem.py @@ -15,13 +15,25 @@ storage_reader_filebased_hitrate_caching, ) -from lapis.scheduler import CondorJobScheduler +from lapis.scheduler import CondorClassadJobScheduler from lapis.simulator import Simulator import sys from lapis.monitor import LoggingUDPSocketHandler, SimulationTimeFilter +# from time import time + +machine_ad_defaults = """ + requirements = target.requestcpus <= my.cpus + rank = 0 + """.strip() + +job_ad_defaults = """ +requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory +rank = 1 +""" +pre_job_rank_defaults = "0" last_step = 0 @@ -46,6 +58,7 @@ def ini_and_run( until=None, calculation_efficiency=1.0, log_telegraf=False, + pre_job_rank=pre_job_rank_defaults, ): # ini logging to file monitoring_logger = logging.getLogger() @@ -68,15 +81,20 @@ def ini_and_run( simulator = Simulator(seed=seed) file_type = "htcondor" file = job_file - # print() - # input() + simulator.create_job_generator( job_input=open(file, "r"), job_reader=partial( job_import_mapper[file_type], calculation_efficiency=calculation_efficiency ), ) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) + + simulator.job_scheduler = CondorClassadJobScheduler( + job_queue=simulator.job_queue, + pre_job_rank=pre_job_rank, + machine_ad=machine_ad_defaults, + job_ad=job_ad_defaults, + ) simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) 
with open(storage_file, "r") as storage_file: From 88c8e34b10d951d50b15553562d43f8a91150e40 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 15 Feb 2020 09:09:13 +0100 Subject: [PATCH 515/648] added debug output in drone --- lapis/drone.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index c3e07d7..80b2611 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -53,6 +53,8 @@ def __init__( # caching-related self.jobs_using_caching = 0 + print(repr(self), "pool resources: ", self.pool_resources) + @property def theoretical_available_resources(self): return dict(self.resources.levels) From 704d2f55f3c352aad6f8bdb70e17f1c4da3b6717 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 15 Feb 2020 09:37:25 +0100 Subject: [PATCH 516/648] temporarily removed job event statistics for debugging purposes --- lapis/drone.py | 2 -- lapis/simulator.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 80b2611..c3e07d7 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -53,8 +53,6 @@ def __init__( # caching-related self.jobs_using_caching = 0 - print(repr(self), "pool resources: ", self.pool_resources) - @property def theoretical_available_resources(self): return dict(self.resources.levels) diff --git a/lapis/simulator.py b/lapis/simulator.py index b472811..7ab78db 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -19,7 +19,7 @@ resource_statistics, pool_status, configuration_information, - job_events, + # job_events, drone_statistics_caching, ) from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -44,7 +44,7 @@ def __init__(self, seed=1234): def enable_monitoring(self): self.monitoring.register_statistic(user_demand) self.monitoring.register_statistic(job_statistics) - self.monitoring.register_statistic(job_events) + # self.monitoring.register_statistic(job_events) self.monitoring.register_statistic(pool_statistics) self.monitoring.register_statistic(drone_statistics) self.monitoring.register_statistic(resource_statistics) From 2da843563993d746ad98c42890fa821ed1017a82 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 15 Feb 2020 11:49:00 +0100 Subject: [PATCH 517/648] added debug output --- lapis/job.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapis/job.py b/lapis/job.py index c8fde13..800f20c 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -163,10 +163,12 @@ async def run(self, drone: "Drone"): scope.do(self._transfer_inputfiles()) scope.do(self._calculate()) except CancelTask: + print("CancelTask") self.drone = None self._success = False # TODO: in_queue_until is still set except BaseException: + print("BaseException") self.drone = None self._success = False # TODO: in_queue_until is still set From 9985ad2af267e2a4a8eda216616e4910b464ffdd Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 15 Feb 2020 13:12:18 +0100 Subject: [PATCH 518/648] reactivating job event monitoring --- lapis/monitor/general.py | 3 +++ lapis/simulator.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 8569e6d..42e3923 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -173,9 +173,12 @@ def job_events(job: Job) -> List[Dict]: result["success"] = 0 error_logged = False for resource_key in job.resources: + print(resource_key) usage = job.used_resources.get( resource_key, job.resources.get(resource_key, None) ) + print(usage, job.resources) + print(job.drone) value = usage / 
job.resources.get( resource_key, job.drone.pool_resources[resource_key] ) diff --git a/lapis/simulator.py b/lapis/simulator.py index 7ab78db..b472811 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -19,7 +19,7 @@ resource_statistics, pool_status, configuration_information, - # job_events, + job_events, drone_statistics_caching, ) from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -44,7 +44,7 @@ def __init__(self, seed=1234): def enable_monitoring(self): self.monitoring.register_statistic(user_demand) self.monitoring.register_statistic(job_statistics) - # self.monitoring.register_statistic(job_events) + self.monitoring.register_statistic(job_events) self.monitoring.register_statistic(pool_statistics) self.monitoring.register_statistic(drone_statistics) self.monitoring.register_statistic(resource_statistics) From e1a69a9683cc1cd9930ac817625a6da6f1abbebc Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 15 Feb 2020 16:03:04 +0100 Subject: [PATCH 519/648] hot fix to avoid jobs with CPU efficiency > 1 being killed --- lapis/drone.py | 5 ++--- lapis/job.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index c3e07d7..8b8faf1 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -143,9 +143,7 @@ async def _run_job(self, job: Job, kill: bool): job_execution = scope.do(job.run(self)) self.jobs += 1 try: - async with self.resources.claim( - **job.resources - ), self.used_resources.claim(**job.used_resources): + async with self.resources.claim(**job.resources): await sampling_required.put(self) if kill: for resource_key in job.resources: @@ -168,6 +166,7 @@ async def _run_job(self, job: Job, kill: bool): # ) # ) except ResourcesUnavailable: + print(repr(job), "ResourcesUnavailable") await instant job_execution.cancel() await instant diff --git a/lapis/job.py b/lapis/job.py index 800f20c..f7fb064 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -157,18 +157,17 @@ async def run(self, drone: "Drone"): await sampling_required.put(self) # print("running job {} in drone {}".format(repr(self), repr(self.drone))) try: + start = time.now async with Scope() as scope: await instant scope.do(self._transfer_inputfiles()) scope.do(self._calculate()) except CancelTask: - print("CancelTask") self.drone = None self._success = False # TODO: in_queue_until is still set except BaseException: - print("BaseException") self.drone = None self._success = False # TODO: in_queue_until is still set From f11d8b5a1b6fa9365bba8e6f98b7563b47317c91 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sat, 15 Feb 2020 17:49:38 +0100 Subject: [PATCH 520/648] making classad definitions of jobs and machines accessible --- custom_simulate.py | 23 ++++++++++++++--------- custom_simulate_batchsystem.py | 13 +++++++++++-- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index dc21ee8..d8087bd 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -58,6 +58,8 @@ def ini_and_run( calculation_efficiency=1.0, log_telegraf=False, pre_job_rank=pre_job_rank_defaults, + machine_ads=machine_ad_defaults, + job_ads=job_ad_defaults, ): # ini logging to file monitoring_logger = logging.getLogger() @@ -77,11 +79,9 @@ def ini_and_run( # ini simulation print("starting static environment") - simulator = Simulator(seed=time()) + simulator = Simulator(seed=seed) file_type = "htcondor" file = job_file - # print() - # input() simulator.create_job_generator( job_input=open(file, "r"), job_reader=partial( @@ -92,8 +92,8 
@@ def ini_and_run( simulator.job_scheduler = CondorClassadJobScheduler( job_queue=simulator.job_queue, pre_job_rank=pre_job_rank, - machine_ad=machine_ad_defaults, - job_ad=job_ad_defaults, + machine_ad=machine_ads, + job_ad=job_ads, ) simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) @@ -120,7 +120,10 @@ def ini_and_run( # job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" -job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" +# job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" +job_file = "/home/tabea/work/testdata/fullsim/test_24h_jobinput.json" +# job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ +# ".json" # pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", # "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] # storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" @@ -134,9 +137,11 @@ def ini_and_run( # job_file = "/home/tabea/work/testdata/hitratebased/week.json" # job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" # job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" -pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines_only_cpu.csv"] -# "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] -storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" +pool_files = [ + "/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv", + "/home/tabea/work/testdata/fullsim/dummycluster.csv", +] +storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" storage_type = "filehitrate" ini_and_run( job_file=job_file, diff --git a/custom_simulate_batchsystem.py b/custom_simulate_batchsystem.py index 2786919..907b38c 100644 --- a/custom_simulate_batchsystem.py +++ b/custom_simulate_batchsystem.py @@ -59,6 +59,8 @@ def ini_and_run( calculation_efficiency=1.0, log_telegraf=False, pre_job_rank=pre_job_rank_defaults, + machine_ads=machine_ad_defaults, + job_ads=job_ad_defaults, ): # ini logging to file monitoring_logger = logging.getLogger() @@ -89,11 +91,18 @@ def ini_and_run( ), ) + print( + "scheduler configuration: \n " + "\tpre job rank: {} \n" + "\tmachine classad: {}\n" + "\tjob classad: {}".format(pre_job_rank, machine_ads, job_ads) + ) + simulator.job_scheduler = CondorClassadJobScheduler( job_queue=simulator.job_queue, pre_job_rank=pre_job_rank, - machine_ad=machine_ad_defaults, - job_ad=job_ad_defaults, + machine_ad=machine_ads, + job_ad=job_ads, ) simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) From 6454de4d18559b72053c5889b86f4a8e81b53f36 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 16 Feb 2020 10:35:10 +0100 Subject: [PATCH 521/648] added classad content for drones and fixed rank inversion --- lapis/scheduler.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 34d3642..595b45b 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -16,6 +16,8 @@ from lapis.job import Job from lapis.monitor import sampling_required +from numpy import mean + class JobQueue(list): pass @@ -79,6 +81,27 @@ def access_wrapped(name, requested=True): return (1 / 1024) * self._temp["disk"] except KeyError: return (1 / 1024) * access_wrapped("disk", requested=False) + elif "cache_demand" in item: + caches = self._wrapped.connection.storages.get( + self._wrapped.sitename, None + ) + try: + return 
mean( + [cache.connection._throughput_scale for cache in caches] + ) + except TypeError: + return 0 + elif "cache_average_throughput" in item: + caches = self._wrapped.connection.storages.get( + self._wrapped.sitename, None + ) + try: + return sum( + [cache.connection.throughput for cache in caches] + ) / float(access_wrapped("cores")) + except TypeError: + return 0 + return super(WrappedClassAd, self).__getitem__(item) def clear_temporary_resources(self): @@ -346,7 +369,7 @@ def _clustering_key(self, item: WrappedClassAd[DJ]): # TODO: assert that order is consistent quantization = self._quantization return RankedClusterKey( - rank=-self._ranking.evaluate(my=item), + rank=-1.0 * self._ranking.evaluate(my=item), key=tuple( int(quantize(item[key], quantization.get(key, 1))) for key in ("cpus", "memory", "disk") From d657b53d6939484d0f44ac91eb0a9c152d3ddf8b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 16 Feb 2020 10:38:51 +0100 Subject: [PATCH 522/648] added definition of more complicated ranks --- custom_simulate.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index d8087bd..07deea8 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -23,16 +23,16 @@ from time import time +pre_job_rank_defaults = "0" + machine_ad_defaults = """ requirements = target.requestcpus <= my.cpus - rank = 0 + rank = 1 """.strip() job_ad_defaults = """ requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory -rank = 1 -""" -pre_job_rank_defaults = "0" +rank = 0""" last_step = 0 @@ -120,8 +120,8 @@ def ini_and_run( # job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" -# job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" -job_file = "/home/tabea/work/testdata/fullsim/test_24h_jobinput.json" +job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" +# job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" # job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ # ".json" # pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", @@ -150,6 +150,11 @@ def ini_and_run( storage_type=storage_type, log_file="test_new_scheduler.log", log_telegraf=True, - # pre_job_rank="100000 * my.cpus + my.memory - 1000000 - 10000000 * my.rank " - pre_job_rank="1", + pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", + machine_ads=""" + requirements = target.requestcpus <= my.cpus + rank = 1 / my.cache_average_throughput + """.strip(), ) + +# rank = my.pipe_utilization + my.average_throughput From cf45d1d123b788f1fb0d90b35a6ffd98c6e78451 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 16 Feb 2020 21:01:02 +0100 Subject: [PATCH 523/648] included lookup of cached data, removed shuffeling of drone clusters --- lapis/drone.py | 14 ++++++++++++++ lapis/scheduler.py | 37 +++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 8b8faf1..e041b78 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -52,6 +52,7 @@ def __init__( # caching-related self.jobs_using_caching = 0 + self.cached_data = 0 @property def theoretical_available_resources(self): @@ -180,5 +181,18 @@ async def _run_job(self, job: Job, kill: bool): self.scheduler.update_drone(self) await sampling_required.put(self) + def look_up_cached_data(self, job: Job): + cached_data = 0 + caches = 
self.connection.storages.get(self.sitename, None) + if caches: + cached_data = sum( + [ + filespecs["hitrates"].get(cache.sitename, 0) * filespecs["filesize"] + for cache in caches + for filespecs in job.requested_inputfiles.values() + ] + ) + self.cached_data = cached_data + def __repr__(self): return "<%s: %s %s>" % (self.__class__.__name__, id(self), self.sitename) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 595b45b..550ed5f 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,4 +1,4 @@ -import random +# import random from abc import ABC from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple from weakref import WeakKeyDictionary @@ -52,6 +52,8 @@ def __init__(self, classad: ClassAd, wrapped: DJ): self._temp = {} def __getitem__(self, item): + print(item) + def access_wrapped(name, requested=True): if isinstance(self._wrapped, Drone): return self._wrapped.theoretical_available_resources[name] @@ -85,23 +87,42 @@ def access_wrapped(name, requested=True): caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) + try: + # print(mean( + # [cache.connection._throughput_scale for cache in caches] + # )) return mean( [cache.connection._throughput_scale for cache in caches] ) except TypeError: + print(0) return 0 - elif "cache_average_throughput" in item: + elif "cache_throughput_per_core" in item: caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) + try: + # print(sum( + # [cache.connection.throughput / 1000. / 1000. / 1000. for cache + # in + # caches] + # ) / float(access_wrapped("cores"))) return sum( - [cache.connection.throughput for cache in caches] + [ + cache.connection.throughput / 1000.0 / 1000.0 / 1000.0 + for cache in caches + ] ) / float(access_wrapped("cores")) except TypeError: + print(0) return 0 + elif "cached_data" in item: + # print(self._wrapped, self._wrapped.cached_data / 1000. / 1000. / 1000.) 
+ return self._wrapped.cached_data / 1000.0 / 1000.0 / 1000.0 + return super(WrappedClassAd, self).__getitem__(item) def clear_temporary_resources(self): @@ -365,6 +386,11 @@ def update(self, item): self.remove(item) self.add(item) + def lookup(self, job: Job): + for ranked_key, drones in self._clusters.items(): + for drone in drones: + drone._wrapped.look_up_cached_data(job) + def _clustering_key(self, item: WrappedClassAd[DJ]): # TODO: assert that order is consistent quantization = self._quantization @@ -479,6 +505,7 @@ def _match_job( ] for cluster_group in pre_job_clusters ) + if job["Rank"] != Undefined(): pre_job_clusters = ( sorted( @@ -490,9 +517,10 @@ def _match_job( ) for cluster_group in pre_job_clusters ) + for cluster_group in pre_job_clusters: # TODO: if we have POST_JOB_RANK, collect *all* matches of a group - random.shuffle(cluster_group) # shuffle cluster to remove bias towards cpus + # random.shuffle(cluster_group) # shuffle cluster to remove bias towards cpus for cluster in cluster_group: for drone in cluster: if drone["Requirements"] == Undefined() or drone.evaluate( @@ -508,6 +536,7 @@ async def _schedule_jobs(self): matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] for queue_index, candidate_job in enumerate(self.job_queue): try: + pre_job_drones.lookup(candidate_job._wrapped) matched_drone = self._match_job( candidate_job, pre_job_drones.cluster_groups() ) From ec6b3cc5fcb837aa1006bd192a0ab3f85731ceb5 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 16 Feb 2020 23:16:15 +0100 Subject: [PATCH 524/648] removed forgotten print --- lapis/scheduler.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 550ed5f..6bfdb68 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -52,8 +52,6 @@ def __init__(self, classad: ClassAd, wrapped: DJ): self._temp = {} def __getitem__(self, item): - print(item) - def access_wrapped(name, requested=True): if isinstance(self._wrapped, Drone): return self._wrapped.theoretical_available_resources[name] @@ -96,7 +94,6 @@ def access_wrapped(name, requested=True): [cache.connection._throughput_scale for cache in caches] ) except TypeError: - print(0) return 0 elif "cache_throughput_per_core" in item: caches = self._wrapped.connection.storages.get( @@ -116,7 +113,6 @@ def access_wrapped(name, requested=True): ] ) / float(access_wrapped("cores")) except TypeError: - print(0) return 0 elif "cached_data" in item: From c2259020ab52858257e1dbd3d24a69755e7da76f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 17 Feb 2020 19:08:20 +0100 Subject: [PATCH 525/648] fixed look up --- lapis/drone.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index e041b78..8f26824 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -167,7 +167,7 @@ async def _run_job(self, job: Job, kill: bool): # ) # ) except ResourcesUnavailable: - print(repr(job), "ResourcesUnavailable") + # print(repr(job), "ResourcesUnavailable") await instant job_execution.cancel() await instant @@ -185,13 +185,15 @@ def look_up_cached_data(self, job: Job): cached_data = 0 caches = self.connection.storages.get(self.sitename, None) if caches: - cached_data = sum( - [ - filespecs["hitrates"].get(cache.sitename, 0) * filespecs["filesize"] - for cache in caches - for filespecs in job.requested_inputfiles.values() - ] - ) + if job.requested_inputfiles: + cached_data = sum( + [ + filespecs["hitrates"].get(cache.sitename, 0) + * 
filespecs["filesize"] + for cache in caches + for filespecs in job.requested_inputfiles.values() + ] + ) self.cached_data = cached_data def __repr__(self): From 124b354e1d9d09d9e6ccf4c5ab4ef29327444f5c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 17 Feb 2020 19:27:03 +0100 Subject: [PATCH 526/648] shuffle Autoclusters --- lapis/scheduler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 6bfdb68..f519568 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,4 +1,4 @@ -# import random +import random from abc import ABC from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple from weakref import WeakKeyDictionary @@ -506,8 +506,9 @@ def _match_job( pre_job_clusters = ( sorted( cluster_group, - key=lambda cluster: job.evaluate( - "Rank", my=job, target=next(iter(cluster)) + key=lambda cluster: ( + job.evaluate("Rank", my=job, target=next(iter(cluster))), + random.random(), ), reverse=True, ) @@ -531,6 +532,8 @@ async def _schedule_jobs(self): pre_job_drones = self._drones.copy() matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] for queue_index, candidate_job in enumerate(self.job_queue): + # if not candidate_job._wrapped.requested_inputfiles: + # continue try: pre_job_drones.lookup(candidate_job._wrapped) matched_drone = self._match_job( From 29168771f3f47a11374579c242f8542729ee1ae7 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 17 Feb 2020 19:27:45 +0100 Subject: [PATCH 527/648] added debug output --- lapis/connection.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lapis/connection.py b/lapis/connection.py index 2a604c3..a1407fe 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -165,7 +165,10 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): for file in requested_files.values() ] ) / sum([file["usedsize"] for file in requested_files.values()]) + print(drone, requested_files, random_inputfile_information, hitrate) provides_file = int(random.random() < hitrate) + print(drone, provides_file) + # input() except ZeroDivisionError: provides_file = 0 # print( From fb652492e73e793891ad3749df4575a4435a7597 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 17 Feb 2020 20:30:51 +0100 Subject: [PATCH 528/648] added monitoring of cache hitrate scores --- lapis/connection.py | 9 ++++++++- lapis/monitor/caching.py | 31 +++++++++++++++++++++++++++++++ lapis/simulator.py | 3 ++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index a1407fe..a8a649e 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -13,7 +13,7 @@ from lapis.storageelement import StorageElement, RemoteStorage from lapis.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor import sampling_required -from lapis.monitor.caching import MonitoredPipeInfo +from lapis.monitor.caching import MonitoredPipeInfo, HitrateInfo class Connection(object): @@ -168,6 +168,13 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): print(drone, requested_files, random_inputfile_information, hitrate) provides_file = int(random.random() < hitrate) print(drone, provides_file) + await sampling_required.put( + HitrateInfo( + hitrate, + sum([file["usedsize"] for file in requested_files.values()]), + provides_file, + ) + ) # input() except ZeroDivisionError: provides_file = 0 diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 
b19c9a0..5406e32 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -17,6 +17,37 @@ class MonitoredPipeInfo(NamedTuple): no_subscriptions: int +class HitrateInfo(NamedTuple): + hitrate: float + volume: float + provides_file: int + + +def hitrate_evaluation(hitrateinfo: HitrateInfo) -> list: + results = [ + { + "hitrate": hitrateinfo.hitrate, + "volume": hitrateinfo.volume, + "providesfile": hitrateinfo.provides_file, + } + ] + return results + + +hitrate_evaluation.name = "hitrate_evaluation" +hitrate_evaluation.whitelist = (HitrateInfo,) +hitrate_evaluation.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), +} + + def storage_status(storage: StorageElement) -> list: """ Log information about current storage object state diff --git a/lapis/simulator.py b/lapis/simulator.py index b472811..4aa50b3 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -12,7 +12,7 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.connection import Connection -from lapis.monitor.caching import storage_status, pipe_status +from lapis.monitor.caching import storage_status, pipe_status, hitrate_evaluation from lapis.monitor.general import ( user_demand, job_statistics, @@ -53,6 +53,7 @@ def enable_monitoring(self): self.monitoring.register_statistic(storage_status) self.monitoring.register_statistic(pipe_status) self.monitoring.register_statistic(drone_statistics_caching) + self.monitoring.register_statistic(hitrate_evaluation) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) From efd7a8cc920f81a69ebcc0ad0d2c03b631e5849e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 17 Feb 2020 20:40:09 +0100 Subject: [PATCH 529/648] removed debug output --- lapis/connection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index a8a649e..39cad42 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -165,9 +165,9 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): for file in requested_files.values() ] ) / sum([file["usedsize"] for file in requested_files.values()]) - print(drone, requested_files, random_inputfile_information, hitrate) + # print(drone, requested_files, random_inputfile_information, hitrate) provides_file = int(random.random() < hitrate) - print(drone, provides_file) + # print(drone, provides_file) await sampling_required.put( HitrateInfo( hitrate, From 4edda25032e0bb77a63867c538e0048bd8566a45 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 18 Feb 2020 13:13:12 +0100 Subject: [PATCH 530/648] configured another connection object to handle dummycluster, fixes #91 --- custom_simulate.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index 07deea8..e07d32d 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -5,6 +5,8 @@ from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter +from lapis.connection import Connection +from lapis.drone import Drone from lapis.job_io.htcondor import htcondor_job_reader from lapis.pool import StaticPool from lapis.pool_io.htcondor 
import htcondor_pool_reader @@ -46,6 +48,19 @@ } +def create_pool_in_simulator(simulator, pool_input, pool_reader, pool_type, connection, + controller=None): + for pool in pool_reader( + iterable=pool_input, + pool_type=pool_type, + make_drone=partial(Drone, simulator.job_scheduler), + connection=connection, + ): + self.pools.append(pool) + if controller: + simulator.controllers.append(controller(target=pool, rate=1)) + + def ini_and_run( job_file, pool_files, @@ -97,6 +112,7 @@ def ini_and_run( ) simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) + dummy_pool_connection = Connection(float("Inf")) with open(storage_file, "r") as storage_file: simulator.create_storage( storage_input=storage_file, @@ -108,11 +124,25 @@ def ini_and_run( for pool_file in pool_files: with open(pool_file, "r") as pool_file: pool_file_type = "htcondor" - simulator.create_pools( - pool_input=pool_file, - pool_reader=pool_import_mapper[pool_file_type], - pool_type=StaticPool, - ) + if "dummycluster" in pool_file: + # Attention: dummy_pool_connection is currently not part of + # monitoring as it is not known within the simulator itself + # TODO: do you need this in monitoring? + create_pool_in_simulator( + simulator=simulator, + pool_input=pool_file, + pool_reader=pool_import_mapper[pool_file_type], + pool_type=StaticPool, + connection=dummy_pool_connection + ) + else: + create_pool_in_simulator( + simulator=simulator, + pool_input=pool_file, + pool_reader=pool_import_mapper[pool_file_type], + pool_type=StaticPool, + connection=simulator.connection + ) simulator.enable_monitoring() # run simulation From 5beb994ea78dfdf67cd8dd5eb2b768839554eb4b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 19 Feb 2020 09:58:14 +0100 Subject: [PATCH 531/648] fixed create_pool_in_simulator --- custom_simulate.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index e07d32d..1b55540 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -48,15 +48,16 @@ } -def create_pool_in_simulator(simulator, pool_input, pool_reader, pool_type, connection, - controller=None): +def create_pool_in_simulator( + simulator, pool_input, pool_reader, pool_type, connection, controller=None +): for pool in pool_reader( - iterable=pool_input, - pool_type=pool_type, - make_drone=partial(Drone, simulator.job_scheduler), - connection=connection, + iterable=pool_input, + pool_type=pool_type, + make_drone=partial(Drone, simulator.job_scheduler), + connection=connection, ): - self.pools.append(pool) + simulator.pools.append(pool) if controller: simulator.controllers.append(controller(target=pool, rate=1)) @@ -124,7 +125,7 @@ def ini_and_run( for pool_file in pool_files: with open(pool_file, "r") as pool_file: pool_file_type = "htcondor" - if "dummycluster" in pool_file: + if "dummycluster" in pool_file.name: # Attention: dummy_pool_connection is currently not part of # monitoring as it is not known within the simulator itself # TODO: do you need this in monitoring? 
@@ -133,7 +134,7 @@ def ini_and_run( pool_input=pool_file, pool_reader=pool_import_mapper[pool_file_type], pool_type=StaticPool, - connection=dummy_pool_connection + connection=dummy_pool_connection, ) else: create_pool_in_simulator( @@ -141,8 +142,9 @@ def ini_and_run( pool_input=pool_file, pool_reader=pool_import_mapper[pool_file_type], pool_type=StaticPool, - connection=simulator.connection + connection=simulator.connection, ) + simulator.enable_monitoring() # run simulation @@ -150,7 +152,9 @@ def ini_and_run( # job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" -job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" +job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_cpu.json" +# job_file = "/home/tabea/work/testdata/modified/single_job.json" +# job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" # job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" # job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ # ".json" @@ -171,20 +175,29 @@ def ini_and_run( "/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv", "/home/tabea/work/testdata/fullsim/dummycluster.csv", ] +# pool_files = ["/home/tabea/work/testdata/hitratebased/minimal_pool.csv"] storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" storage_type = "filehitrate" ini_and_run( job_file=job_file, + remote_throughput=0.75, + calculation_efficiency=0.99, pool_files=pool_files, storage_file=storage_file, storage_type=storage_type, log_file="test_new_scheduler.log", log_telegraf=True, - pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", + # pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", + pre_job_rank="0", machine_ads=""" requirements = target.requestcpus <= my.cpus - rank = 1 / my.cache_average_throughput + rank = 0 """.strip(), + job_ads=""" + Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory + Rank = 0 """, ) +# * (target.cache_demand > 0.1) * target.cache_demand * +# target.cache_throughput # rank = my.pipe_utilization + my.average_throughput From 5952fd4e9159efd021dc10f7b888645a5fbed698 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 19 Feb 2020 10:46:51 +0100 Subject: [PATCH 532/648] make custom_simulate and custom_simulate_batchsystem use same function to execute simulation --- custom_simulate.py | 109 ++++++++++++++------------- custom_simulate_batchsystem.py | 132 +-------------------------------- 2 files changed, 63 insertions(+), 178 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index 1b55540..969d086 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -105,6 +105,13 @@ def ini_and_run( ), ) + print( + "scheduler configuration: \n " + "\tpre job rank: {} \n" + "\tmachine classad: {}\n" + "\tjob classad: {}".format(pre_job_rank, machine_ads, job_ads) + ) + simulator.job_scheduler = CondorClassadJobScheduler( job_queue=simulator.job_queue, pre_job_rank=pre_job_rank, @@ -112,7 +119,7 @@ def ini_and_run( job_ad=job_ads, ) - simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) + simulator.create_connection_module(remote_throughput * 1000 * 1000 * 1000) dummy_pool_connection = Connection(float("Inf")) with open(storage_file, "r") as storage_file: simulator.create_storage( @@ -151,53 +158,55 @@ def ini_and_run( simulator.run(until=until) -# job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" 
-job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_cpu.json" -# job_file = "/home/tabea/work/testdata/modified/single_job.json" -# job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" -# job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" -# job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ -# ".json" -# pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", -# "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] -# storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" -# storage_type = "filehitrate" -# -# ini_and_run(job_file=job_file, pool_files=pool_files, storage_file=storage_file, -# storage_type=storage_type, log_file="minimal_hitratebased_test.log", -# log_telegraf=True) - -# job_file = "/home/tabea/work/testdata/hitratebased/testjobs.json" -# job_file = "/home/tabea/work/testdata/hitratebased/week.json" -# job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" -# job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" -pool_files = [ - "/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv", - "/home/tabea/work/testdata/fullsim/dummycluster.csv", -] -# pool_files = ["/home/tabea/work/testdata/hitratebased/minimal_pool.csv"] -storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" -storage_type = "filehitrate" -ini_and_run( - job_file=job_file, - remote_throughput=0.75, - calculation_efficiency=0.99, - pool_files=pool_files, - storage_file=storage_file, - storage_type=storage_type, - log_file="test_new_scheduler.log", - log_telegraf=True, - # pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", - pre_job_rank="0", - machine_ads=""" - requirements = target.requestcpus <= my.cpus - rank = 0 - """.strip(), - job_ads=""" - Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory - Rank = 0 """, -) +if __name__ == "__main__": + + # job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" + job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_cpu.json" + # job_file = "/home/tabea/work/testdata/modified/single_job.json" + # job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" + # job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" + # job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ + # ".json" + # pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", + # "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] + # storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" + # storage_type = "filehitrate" + # + # ini_and_run(job_file=job_file, pool_files=pool_files, storage_file=storage_file, + # storage_type=storage_type, log_file="minimal_hitratebased_test.log", + # log_telegraf=True) + + # job_file = "/home/tabea/work/testdata/hitratebased/testjobs.json" + # job_file = "/home/tabea/work/testdata/hitratebased/week.json" + # job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" + # job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" + pool_files = [ + "/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv", + "/home/tabea/work/testdata/fullsim/dummycluster.csv", + ] + # pool_files = ["/home/tabea/work/testdata/hitratebased/minimal_pool.csv"] + storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" + storage_type = 
"filehitrate" + ini_and_run( + job_file=job_file, + remote_throughput=0.75, + calculation_efficiency=0.99, + pool_files=pool_files, + storage_file=storage_file, + storage_type=storage_type, + log_file="test_new_scheduler.log", + log_telegraf=True, + # pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", + pre_job_rank="0", + machine_ads=""" + requirements = target.requestcpus <= my.cpus + rank = 0 + """.strip(), + job_ads=""" + Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory + Rank = 0 """, + ) -# * (target.cache_demand > 0.1) * target.cache_demand * -# target.cache_throughput -# rank = my.pipe_utilization + my.average_throughput + # * (target.cache_demand > 0.1) * target.cache_demand * + # target.cache_throughput + # rank = my.pipe_utilization + my.average_throughput diff --git a/custom_simulate_batchsystem.py b/custom_simulate_batchsystem.py index 907b38c..f6cc4c9 100644 --- a/custom_simulate_batchsystem.py +++ b/custom_simulate_batchsystem.py @@ -1,132 +1,5 @@ -from functools import partial - -import logging.handlers - -from cobald.monitor.format_json import JsonFormatter -from cobald.monitor.format_line import LineProtocolFormatter - -from lapis.job_io.htcondor import htcondor_job_reader -from lapis.pool import StaticPool -from lapis.pool_io.htcondor import htcondor_pool_reader -from lapis.job_io.swf import swf_job_reader -from lapis.storageelement import FileBasedHitrateStorage -from lapis.storage_io.storage import ( - storage_reader, - storage_reader_filebased_hitrate_caching, -) - -from lapis.scheduler import CondorClassadJobScheduler -from lapis.simulator import Simulator - import sys - -from lapis.monitor import LoggingUDPSocketHandler, SimulationTimeFilter - -# from time import time - -machine_ad_defaults = """ - requirements = target.requestcpus <= my.cpus - rank = 0 - """.strip() - -job_ad_defaults = """ -requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory -rank = 1 -""" -pre_job_rank_defaults = "0" - -last_step = 0 - -job_import_mapper = {"htcondor": htcondor_job_reader, "swf": swf_job_reader} - -pool_import_mapper = {"htcondor": htcondor_pool_reader} - -storage_import_mapper = { - "standard": storage_reader, - "filehitrate": storage_reader_filebased_hitrate_caching, -} - - -def ini_and_run( - job_file, - pool_files, - storage_file, - storage_type, - log_file="test.log", - remote_throughput=1.0, - seed=1234, - until=None, - calculation_efficiency=1.0, - log_telegraf=False, - pre_job_rank=pre_job_rank_defaults, - machine_ads=machine_ad_defaults, - job_ads=job_ad_defaults, -): - # ini logging to file - monitoring_logger = logging.getLogger() - monitoring_logger.setLevel(logging.DEBUG) - time_filter = SimulationTimeFilter() - monitoring_logger.addFilter(time_filter) - streamHandler = logging.StreamHandler(stream=open(log_file, "w")) - streamHandler.setFormatter(JsonFormatter()) - monitoring_logger.addHandler(streamHandler) - - if log_telegraf: - telegrafHandler = LoggingUDPSocketHandler( - "localhost", logging.handlers.DEFAULT_UDP_LOGGING_PORT - ) - telegrafHandler.setFormatter(LineProtocolFormatter(resolution=1)) - monitoring_logger.addHandler(telegrafHandler) - - # ini simulation - print("starting static environment") - simulator = Simulator(seed=seed) - file_type = "htcondor" - file = job_file - - simulator.create_job_generator( - job_input=open(file, "r"), - job_reader=partial( - job_import_mapper[file_type], calculation_efficiency=calculation_efficiency - ), - ) - - print( - "scheduler 
configuration: \n " - "\tpre job rank: {} \n" - "\tmachine classad: {}\n" - "\tjob classad: {}".format(pre_job_rank, machine_ads, job_ads) - ) - - simulator.job_scheduler = CondorClassadJobScheduler( - job_queue=simulator.job_queue, - pre_job_rank=pre_job_rank, - machine_ad=machine_ads, - job_ad=job_ads, - ) - - simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) - with open(storage_file, "r") as storage_file: - simulator.create_storage( - storage_input=storage_file, - storage_content_input=None, - storage_reader=storage_import_mapper[storage_type], - storage_type=FileBasedHitrateStorage, - ) - - for pool_file in pool_files: - with open(pool_file, "r") as pool_file: - pool_file_type = "htcondor" - simulator.create_pools( - pool_input=pool_file, - pool_reader=pool_import_mapper[pool_file_type], - pool_type=StaticPool, - ) - simulator.enable_monitoring() - - # run simulation - simulator.run(until=until) - +from custom_simulate import ini_and_run ini_and_run( job_file=sys.argv[1], @@ -137,4 +10,7 @@ def ini_and_run( remote_throughput=float(sys.argv[6]), calculation_efficiency=float(sys.argv[7]), log_telegraf=False, + pre_job_rank=sys.argv[8], + machine_ads=sys.argv[9], + job_ads=sys.argv[10], ) From 4da4cce50e6be04b2ee55cbde111166a8858ea70 Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Mon, 17 Feb 2020 18:35:20 +0100 Subject: [PATCH 533/648] added fake auto-clusters --- lapis/scheduler.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index f519568..d9c2e58 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -419,6 +419,64 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: yield group +class RankedNonClusters(Generic[DJ]): + """Automatically cluster jobs or drones by rank only""" + + def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): + self._quantization = quantization + self._ranking = ranking + self._clusters: Dict[float, Set[WrappedClassAd[DJ]]] = SortedDict() + self._inverse: Dict[WrappedClassAd[DJ], float] = {} + + def copy(self) -> "RankedNonClusters[DJ]": + """Copy the entire ranked auto clusters""" + clone = type(self)(quantization=self._quantization, ranking=self._ranking) + clone._clusters = SortedDict( + (key, value.copy()) for key, value in self._clusters.items() + ) + clone._inverse = self._inverse.copy() + return clone + + def add(self, item: WrappedClassAd[DJ]): + """Add a new item""" + if item in self._inverse: + raise ValueError(f"{item!r} already stored; use `.update(item)` instead") + item_key = self._clustering_key(item) + try: + self._clusters[item_key].add(item) + except KeyError: + self._clusters[item_key] = {item} + self._inverse[item] = item_key + + def remove(self, item: WrappedClassAd[DJ]): + """Remove an existing item""" + item_key = self._inverse.pop(item) + cluster = self._clusters[item_key] + cluster.remove(item) + if not cluster: + del self._clusters[item_key] + + def update(self, item): + """Update an existing item with its current state""" + self.remove(item) + self.add(item) + + def _clustering_key(self, item: WrappedClassAd[DJ]): + # TODO: assert that order is consistent + return -1.0 * self._ranking.evaluate(my=item) + + def clusters(self) -> Iterator[Set[WrappedClassAd[DJ]]]: + return iter(self._clusters.values()) + + def items(self) -> Iterator[Tuple[float, Set[WrappedClassAd[DJ]]]]: + return iter(self._clusters.items()) + + def cluster_groups(self) -> 
Iterator[List[Set[WrappedClassAd[Drone]]]]: + """Group autoclusters by PreJobRank""" + for ranked_key, drones in self._clusters.items(): + yield [{item} for item in drones] + + class CondorClassadJobScheduler(JobScheduler): """ Goal of the htcondor job scheduler is to have a scheduler that somehow From 4087ccec91b101f75262cb3d66688611ca1c2b5e Mon Sep 17 00:00:00 2001 From: Max Fischer Date: Mon, 17 Feb 2020 18:50:11 +0100 Subject: [PATCH 534/648] added interface for ranked autoclusters --- lapis/scheduler.py | 69 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index d9c2e58..fb7c31e 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,6 +1,6 @@ import random -from abc import ABC -from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple +from abc import ABC, abstractmethod +from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple, Any from weakref import WeakKeyDictionary from sortedcontainers import SortedDict @@ -340,7 +340,51 @@ class RankedClusterKey(NamedTuple): key: Tuple[float, ...] -class RankedAutoClusters(Generic[DJ]): +RC = TypeVar("RC", bound="RankedClusters") + + +class RankedClusters(Generic[DJ]): + """Automatically cluster drones by rank""" + + @abstractmethod + def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): + raise NotImplementedError + + @abstractmethod + def copy(self: RC[DJ]) -> RC[DJ]: + """Copy the entire ranked auto clusters""" + raise NotImplementedError + + @abstractmethod + def add(self, item: WrappedClassAd[DJ]) -> None: + """Add a new item""" + raise NotImplementedError + + @abstractmethod + def remove(self, item: WrappedClassAd[DJ]) -> None: + """Remove an existing item""" + raise NotImplementedError + + def update(self, item) -> None: + """Update an existing item with its current state""" + self.remove(item) + self.add(item) + + @abstractmethod + def clusters(self) -> Iterator[Set[WrappedClassAd[DJ]]]: + raise NotImplementedError + + @abstractmethod + def items(self) -> Iterator[Tuple[Any, Set[WrappedClassAd[DJ]]]]: + raise NotImplementedError + + @abstractmethod + def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: + """Group autoclusters by PreJobRank""" + raise NotImplementedError + + +class RankedAutoClusters(RankedClusters[DJ]): """Automatically cluster similar jobs or drones""" def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): @@ -350,7 +394,6 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): self._inverse: Dict[WrappedClassAd[DJ], RankedClusterKey] = {} def copy(self) -> "RankedAutoClusters[DJ]": - """Copy the entire ranked auto clusters""" clone = type(self)(quantization=self._quantization, ranking=self._ranking) clone._clusters = SortedDict( (key, value.copy()) for key, value in self._clusters.items() @@ -359,7 +402,6 @@ def copy(self) -> "RankedAutoClusters[DJ]": return clone def add(self, item: WrappedClassAd[DJ]): - """Add a new item""" if item in self._inverse: raise ValueError(f"{item!r} already stored; use `.update(item)` instead") item_key = self._clustering_key(item) @@ -370,18 +412,12 @@ def add(self, item: WrappedClassAd[DJ]): self._inverse[item] = item_key def remove(self, item: WrappedClassAd[DJ]): - """Remove an existing item""" item_key = self._inverse.pop(item) cluster = self._clusters[item_key] cluster.remove(item) if not cluster: del self._clusters[item_key] - def update(self, 
item): - """Update an existing item with its current state""" - self.remove(item) - self.add(item) - def lookup(self, job: Job): for ranked_key, drones in self._clusters.items(): for drone in drones: @@ -405,7 +441,6 @@ def items(self) -> Iterator[Tuple[RankedClusterKey, Set[WrappedClassAd[DJ]]]]: return iter(self._clusters.items()) def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: - """Group autoclusters by PreJobRank""" group = [] current_rank = None for ranked_key, drones in self._clusters.items(): @@ -419,7 +454,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: yield group -class RankedNonClusters(Generic[DJ]): +class RankedNonClusters(RankedClusters[DJ]): """Automatically cluster jobs or drones by rank only""" def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): @@ -429,7 +464,6 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): self._inverse: Dict[WrappedClassAd[DJ], float] = {} def copy(self) -> "RankedNonClusters[DJ]": - """Copy the entire ranked auto clusters""" clone = type(self)(quantization=self._quantization, ranking=self._ranking) clone._clusters = SortedDict( (key, value.copy()) for key, value in self._clusters.items() @@ -438,7 +472,6 @@ def copy(self) -> "RankedNonClusters[DJ]": return clone def add(self, item: WrappedClassAd[DJ]): - """Add a new item""" if item in self._inverse: raise ValueError(f"{item!r} already stored; use `.update(item)` instead") item_key = self._clustering_key(item) @@ -449,7 +482,6 @@ def add(self, item: WrappedClassAd[DJ]): self._inverse[item] = item_key def remove(self, item: WrappedClassAd[DJ]): - """Remove an existing item""" item_key = self._inverse.pop(item) cluster = self._clusters[item_key] cluster.remove(item) @@ -457,12 +489,10 @@ def remove(self, item: WrappedClassAd[DJ]): del self._clusters[item_key] def update(self, item): - """Update an existing item with its current state""" self.remove(item) self.add(item) def _clustering_key(self, item: WrappedClassAd[DJ]): - # TODO: assert that order is consistent return -1.0 * self._ranking.evaluate(my=item) def clusters(self) -> Iterator[Set[WrappedClassAd[DJ]]]: @@ -472,7 +502,6 @@ def items(self) -> Iterator[Tuple[float, Set[WrappedClassAd[DJ]]]]: return iter(self._clusters.items()) def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: - """Group autoclusters by PreJobRank""" for ranked_key, drones in self._clusters.items(): yield [{item} for item in drones] @@ -502,7 +531,7 @@ def __init__( interval: float = 60, ): self._stream_queue = job_queue - self._drones: RankedAutoClusters[Drone] = RankedAutoClusters( + self._drones: RankedClusters[Drone] = RankedAutoClusters( quantization=quantization_defaults, ranking=parse(pre_job_rank) ) self.interval = interval From 73b5e888786449e0702d1fdf99516261d3c8169c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 19 Feb 2020 10:59:31 +0100 Subject: [PATCH 535/648] adapted RankClusters to caching functionality --- lapis/scheduler.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index fb7c31e..4b988ab 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -350,10 +350,10 @@ class RankedClusters(Generic[DJ]): def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): raise NotImplementedError - @abstractmethod - def copy(self: RC[DJ]) -> RC[DJ]: - """Copy the entire ranked auto clusters""" - raise NotImplementedError + # 
@abstractmethod + # def copy(self: RC[DJ]) -> RC[DJ]: + # """Copy the entire ranked auto clusters""" + # raise NotImplementedError @abstractmethod def add(self, item: WrappedClassAd[DJ]) -> None: @@ -383,6 +383,11 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: """Group autoclusters by PreJobRank""" raise NotImplementedError + @abstractmethod + def lookup(self, job: Job) -> None: + """Update information about cached data for every drone""" + raise NotImplementedError + class RankedAutoClusters(RankedClusters[DJ]): """Automatically cluster similar jobs or drones""" @@ -418,11 +423,6 @@ def remove(self, item: WrappedClassAd[DJ]): if not cluster: del self._clusters[item_key] - def lookup(self, job: Job): - for ranked_key, drones in self._clusters.items(): - for drone in drones: - drone._wrapped.look_up_cached_data(job) - def _clustering_key(self, item: WrappedClassAd[DJ]): # TODO: assert that order is consistent quantization = self._quantization @@ -453,6 +453,11 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: if group: yield group + def lookup(self, job: Job): + for ranked_key, drones in self._clusters.items(): + for drone in drones: + drone._wrapped.look_up_cached_data(job) + class RankedNonClusters(RankedClusters[DJ]): """Automatically cluster jobs or drones by rank only""" @@ -505,6 +510,11 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: for ranked_key, drones in self._clusters.items(): yield [{item} for item in drones] + def lookup(self, job: Job): + for ranked_key, drones in self._clusters.items(): + for drone in drones: + drone._wrapped.look_up_cached_data(job) + class CondorClassadJobScheduler(JobScheduler): """ From 81e2e17ececf23d412b884feb195c31c9ef5ad21 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 19 Feb 2020 14:51:10 +0100 Subject: [PATCH 536/648] avoid walltime recalculation for jobs without inputfiles and jobs running on the dummy cluster --- lapis/job.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lapis/job.py b/lapis/job.py index f7fb064..509d7f8 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -122,7 +122,11 @@ async def _calculate(self): # ) result = self.walltime try: - if not self.requested_inputfiles: + if ( + not self.requested_inputfiles + or self.drone.connection.remote_connection.connection.throughput + == float("Inf") + ): raise KeyError result = ( self.used_resources["cores"] / self.calculation_efficiency From bc1a0dcc65c0e1e323e85e6487cf0db636d444ee Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 19 Feb 2020 15:58:21 +0100 Subject: [PATCH 537/648] changed unit of filesizes/throughput in caching monitoring to GB --- lapis/monitor/caching.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 5406e32..c661ac2 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -27,7 +27,7 @@ def hitrate_evaluation(hitrateinfo: HitrateInfo) -> list: results = [ { "hitrate": hitrateinfo.hitrate, - "volume": hitrateinfo.volume, + "volume": hitrateinfo.volume / 1000.0 / 1000.0 / 1000.0, "providesfile": hitrateinfo.provides_file, } ] @@ -88,8 +88,11 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: results = [ { "pipe": repr(pipeinfo.pipename), - "throughput": pipeinfo.available_throughput, - "requested_throughput": pipeinfo.requested_throughput, + "throughput": pipeinfo.available_throughput / 1000.0 / 1000.0 / 1000.0, + "requested_throughput": 
pipeinfo.requested_throughput + / 1000.0 + / 1000.0 + / 1000.0, "throughput_scale": pipeinfo.throughputscale, "no_subscribers": pipeinfo.no_subscriptions, } From 3a6c0aaf9e44788234d9bdfe5aaf1beccc55a7d5 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 19 Feb 2020 16:31:35 +0100 Subject: [PATCH 538/648] decompressed evaluation of job rank/requirement for debugging purposes --- lapis/scheduler.py | 54 +++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 4b988ab..dda0733 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -539,11 +539,21 @@ def __init__( job_ad: str = job_ad_defaults, pre_job_rank: str = "0", interval: float = 60, + autocluster: bool = False, ): self._stream_queue = job_queue - self._drones: RankedClusters[Drone] = RankedAutoClusters( + self._drones: RankedClusters[Drone] = RankedNonClusters( quantization=quantization_defaults, ranking=parse(pre_job_rank) ) + # if autocluster: + # self._drones: RankedClusters[Drone] = RankedAutoClusters( + # quantization=quantization_defaults, ranking=parse(pre_job_rank) + # ) + # else: + # self._drones: RankedClusters[Drone] = RankedNonClusters( + # quantization=quantization_defaults, ranking=parse(pre_job_rank) + # ) + self.interval = interval self.job_queue = JobQueue() self._collecting = True @@ -590,31 +600,33 @@ def _match_job( job: ClassAd, pre_job_clusters: Iterator[List[Set[WrappedClassAd[Drone]]]] ): if job["Requirements"] != Undefined(): - pre_job_clusters = ( - [ - cluster - for cluster in cluster_group - if job.evaluate("Requirements", my=job, target=next(iter(cluster))) - ] - for cluster_group in pre_job_clusters - ) + pre_job_clusters_tmp = [] + for cluster_group in pre_job_clusters: + cluster_group_tmp = [] + for cluster in cluster_group: + if job.evaluate("Requirements", my=job, target=next(iter(cluster))): + cluster_group_tmp.append(cluster) + pre_job_clusters_tmp.append(cluster_group_tmp) + pre_job_clusters = pre_job_clusters_tmp if job["Rank"] != Undefined(): - pre_job_clusters = ( - sorted( - cluster_group, - key=lambda cluster: ( - job.evaluate("Rank", my=job, target=next(iter(cluster))), - random.random(), - ), - reverse=True, + pre_job_clusters_tmp = [] + for cluster_group in pre_job_clusters: + pre_job_clusters_tmp.append( + sorted( + cluster_group, + key=lambda cluster: ( + job.evaluate("Rank", my=job, target=next(iter(cluster))), + random.random(), + ), + reverse=True, + ) ) - for cluster_group in pre_job_clusters - ) + + pre_job_clusters = pre_job_clusters_tmp for cluster_group in pre_job_clusters: # TODO: if we have POST_JOB_RANK, collect *all* matches of a group - # random.shuffle(cluster_group) # shuffle cluster to remove bias towards cpus for cluster in cluster_group: for drone in cluster: if drone["Requirements"] == Undefined() or drone.evaluate( @@ -629,8 +641,6 @@ async def _schedule_jobs(self): pre_job_drones = self._drones.copy() matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] for queue_index, candidate_job in enumerate(self.job_queue): - # if not candidate_job._wrapped.requested_inputfiles: - # continue try: pre_job_drones.lookup(candidate_job._wrapped) matched_drone = self._match_job( From 540f8c054ebd912869c37d65013d1aae5a1a4a25 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 10:23:20 +0100 Subject: [PATCH 539/648] disabled cache algorithm evaluation in case of no filebased caching --- lapis/connection.py | 52 
+++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 39cad42..a30d58c 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -18,9 +18,14 @@ class Connection(object): - __slots__ = ("storages", "remote_connection", "caching_algorithm") - - def __init__(self, throughput=1000 * 1000 * 1000): + __slots__ = ( + "storages", + "remote_connection", + "caching_algorithm", + "_filebased_caching", + ) + + def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): self.storages = dict() self.remote_connection = RemoteStorage(MonitoredPipe(throughput=throughput)) self.caching_algorithm = CacheAlgorithm( @@ -30,6 +35,7 @@ def __init__(self, throughput=1000 * 1000 * 1000): file, storage ), ) + self._filebased_caching = filebased_caching async def run_pipemonitoring(self): async def report_load_to_monitoring(pipe: MonitoredPipe): @@ -114,27 +120,27 @@ async def stream_file( used_connection = await self._determine_inputfile_source( requested_file, dronesite, job_repr ) - # await sampling_required.put(used_connection) - if used_connection == self.remote_connection and self.storages.get( - dronesite, None - ): - try: - potential_cache = random.choice(self.storages[dronesite]) - cache_file, files_for_deletion = self.caching_algorithm.consider( - file=requested_file, storage=potential_cache - ) - if cache_file: - for file in files_for_deletion: - await potential_cache.remove(file, job_repr) - await potential_cache.add(requested_file, job_repr) - else: - print( - f"APPLY CACHING DECISION: Job {job_repr}, " - f"File {requested_file.filename}: File wasnt " - f"cached @ {time.now}" + if self._filebased_caching: + if used_connection == self.remote_connection and self.storages.get( + dronesite, None + ): + try: + potential_cache = random.choice(self.storages[dronesite]) + cache_file, files_for_deletion = self.caching_algorithm.consider( + file=requested_file, storage=potential_cache ) - except KeyError: - pass + if cache_file: + for file in files_for_deletion: + await potential_cache.remove(file, job_repr) + await potential_cache.add(requested_file, job_repr) + else: + print( + f"APPLY CACHING DECISION: Job {job_repr}, " + f"File {requested_file.filename}: File wasnt " + f"cached @ {time.now}" + ) + except KeyError: + pass # print(f"now transfering {requested_file.filesize} from {used_connection}") await used_connection.transfer(requested_file, job_repr=job_repr) # print( From 0685cf5f7203c95bd6122ea0d98724ce429bd952 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 15:58:16 +0100 Subject: [PATCH 540/648] added monitoring information --- lapis/connection.py | 19 ++++--- lapis/drone.py | 6 +- lapis/job.py | 27 ++++++--- lapis/monitor/caching.py | 117 +++++++++++++++++++++------------------ lapis/monitor/general.py | 17 +++++- lapis/scheduler.py | 20 ++++++- lapis/simulator.py | 11 +++- lapis/storageelement.py | 7 ++- monitoredpipe.py | 9 ++- 9 files changed, 156 insertions(+), 77 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index a30d58c..d6c68d6 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -40,6 +40,14 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): async def run_pipemonitoring(self): async def report_load_to_monitoring(pipe: MonitoredPipe): async for throughput in pipe.load(): + print( + time.now, + "registered change, sending to monitoring", + time.now, + pipe, + pipe._subscriptions, + 
pipe._throughput_scale, + ) await sampling_required.put( MonitoredPipeInfo( throughput, @@ -149,7 +157,7 @@ async def stream_file( # ) # ) - async def transfer_files(self, drone, requested_files: dict, job_repr=None): + async def transfer_files(self, drone, requested_files: dict, job_repr): """ Converts dict information about requested files to RequestedFile object and sequentially streams all files @@ -171,9 +179,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): for file in requested_files.values() ] ) / sum([file["usedsize"] for file in requested_files.values()]) - # print(drone, requested_files, random_inputfile_information, hitrate) provides_file = int(random.random() < hitrate) - # print(drone, provides_file) await sampling_required.put( HitrateInfo( hitrate, @@ -184,11 +190,8 @@ async def transfer_files(self, drone, requested_files: dict, job_repr=None): # input() except ZeroDivisionError: provides_file = 0 - # print( - # "{} on {} hitrate {} => {}".format( - # requested_files, drone.sitename, hitrate, provides_file - # ) - # ) + + job_repr._read_from_cache = provides_file for inputfilename, inputfilespecs in requested_files.items(): if "hitrates" in inputfilespecs.keys(): diff --git a/lapis/drone.py b/lapis/drone.py index 8f26824..5c3ca64 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -51,7 +51,7 @@ def __init__( self._job_queue = Queue() # caching-related - self.jobs_using_caching = 0 + self.jobs_with_cached_data = 0 self.cached_data = 0 @property @@ -143,6 +143,8 @@ async def _run_job(self, job: Job, kill: bool): job_execution = scope.do(job.run(self)) self.jobs += 1 + if job._cached_data: + self.jobs_with_cached_data += 1 try: async with self.resources.claim(**job.resources): await sampling_required.put(self) @@ -176,6 +178,8 @@ async def _run_job(self, job: Job, kill: bool): job_execution.cancel() await instant self.jobs -= 1 + if job._cached_data: + self.jobs_with_cached_data -= 1 await self.scheduler.job_finished(job) self._utilisation = self._allocation = None self.scheduler.update_drone(self) diff --git a/lapis/job.py b/lapis/job.py index 509d7f8..308edd9 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -26,9 +26,14 @@ class Job(object): "_success", "calculation_efficiency", "__weakref__", - "_coordinated", - "_used_cache", + "_read_from_cache", + "_cached_data", "_total_input_data", + "_original_walltime", + "_calculation_time", + "_transfer_time", + "failed_matches", + "cputime", ) def __init__( @@ -79,14 +84,19 @@ def __init__( # caching-related self.requested_inputfiles = resources.pop("inputfiles", None) self.used_inputfiles = used_resources.pop("inputfiles", None) - self._coordinated = 0 - self._used_cache = 0 + self._read_from_cache = 0 + self._cached_data = 0 + self._original_walltime = self.walltime + self._calculation_time = 0 + self._transfer_time = 0 + self.cputime = self.used_resources["cores"] * self.walltime try: self._total_input_data = sum( [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] ) except AttributeError: self._total_input_data = 0 + self.failed_matches = 0 @property def name(self) -> str: @@ -131,6 +141,8 @@ async def _calculate(self): result = ( self.used_resources["cores"] / self.calculation_efficiency ) * self.walltime + self._calculation_time = result + except (KeyError, TypeError): pass # start = time.now @@ -138,13 +150,11 @@ async def _calculate(self): # print(f"finished calculation at {time.now - start}") async def _transfer_inputfiles(self): + start = time.now try: - # start = time.now 
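The timing fields added in this commit (cputime, _original_walltime, _calculation_time, _transfer_time) feed the extended job_events record later in the same patch. A minimal, framework-free sketch of how those derived metrics combine, using only the formulas visible in this diff; the helper name and the example numbers are invented for illustration and are not part of lapis:

def derived_job_metrics(cores, calculation_efficiency, original_walltime,
                        measured_walltime, total_input_data):
    # cputime is fixed once in Job.__init__ from the nominal (input) walltime
    cputime = cores * original_walltime
    # compute-bound estimate as in Job._calculate()
    calculation_time = (cores / calculation_efficiency) * original_walltime
    return {
        "diff": measured_walltime - original_walltime,
        "efficiency": cputime / measured_walltime,
        "calculation_time": calculation_time,
        "data_throughput": total_input_data / 1000.0 / 1000.0 / 1000.0 / measured_walltime,
    }

# e.g. an 8-core job with 90 % calculation efficiency, a nominal walltime of
# 3600 s that took 4320 s in the simulation and read 50 GB of input
derived_job_metrics(8, 0.9, 3600, 4320, 50e9)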
# print(f"TRANSFERING INPUTFILES: Job {self} @ {start}") await self.drone.connection.transfer_files( - drone=self.drone, - requested_files=self.used_inputfiles, - job_repr=repr(self), + drone=self.drone, requested_files=self.used_inputfiles, job_repr=self ) # print( # f"streamed inputfiles {self.used_inputfiles.keys()} for job {self} " @@ -152,6 +162,7 @@ async def _transfer_inputfiles(self): # ) except AttributeError: pass + self._transfer_time = time.now - start async def run(self, drone: "Drone"): assert drone, "Jobs cannot run without a drone being assigned" diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index c661ac2..eb73731 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -5,8 +5,16 @@ from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter -from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler +from lapis.monitor import ( + LoggingSocketHandler, + LoggingUDPSocketHandler, + SIMULATION_START, +) from lapis.storageelement import StorageElement +from monitoredpipe import MonitoredPipe + +import time as pytime +from usim import time class MonitoredPipeInfo(NamedTuple): @@ -23,6 +31,36 @@ class HitrateInfo(NamedTuple): provides_file: int +class SimulationInfo(NamedTuple): + input: list + identifier: str + + +def simulation_id(simulationinfo) -> list: + results = [ + { + "input": str(simulationinfo.input), + "id": simulationinfo.identifier, + "time": pytime.ctime(SIMULATION_START), + } + ] + return results + + +simulation_id.name = "simulation_id" +simulation_id.whitelist = (SimulationInfo,) +simulation_id.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1 + ), +} + + def hitrate_evaluation(hitrateinfo: HitrateInfo) -> list: results = [ { @@ -97,6 +135,7 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: "no_subscribers": pipeinfo.no_subscriptions, } ] + print(time.now, "monitoring:", results) return results @@ -114,56 +153,26 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: } -# def storage_connection(storage: StorageElement) -> list: -# """ -# Log information about the storages connection -# :param storage: -# :return: -# """ -# results = [ -# { -# "storage": repr(storage), -# "throughput": storage.connection.throughput, -# "requested_throughput": sum(storage.connection._subscriptions.values()), -# "throughput_scale": storage.connection._throughput_scale, -# } -# ] -# return results -# -# -# storage_connection.name = "storage_connection" -# storage_connection.whitelist = (StorageElement,) -# storage_connection.logging_formatter = { -# LoggingSocketHandler.__name__: JsonFormatter(), -# logging.StreamHandler.__name__: JsonFormatter(), -# LoggingUDPSocketHandler.__name__: LineProtocolFormatter( -# tags={"tardis", "storage"}, resolution=1 -# ), -# } -# -# -# def remote_connection(remote: Pipe) -> list: -# """ -# Log information about the remote connection -# :param remote: -# :return: -# """ -# results = [ -# { -# "throughput": remote.throughput, -# "requested_throughput": sum(remote._subscriptions.values()), -# "throughput_scale": remote._throughput_scale, -# } -# ] -# return results -# -# -# remote_connection.name = "remote_connection" -# remote_connection.whitelist = (Pipe,) -# 
remote_connection.logging_formatter = { -# LoggingSocketHandler.__name__: JsonFormatter(), -# logging.StreamHandler.__name__: JsonFormatter(), -# LoggingUDPSocketHandler.__name__: LineProtocolFormatter( -# tags={"tardis"}, resolution=1 -# ), -# } +def pipe_data_volume(pipe: MonitoredPipe): + """ + Total amount of data transferred by the pipe up to this point + :param pipe: + :return: + """ + results = [{"pipe": repr(pipe), "current_total": pipe.transferred_data}] + print(results) + return results + + +pipe_data_volume.name = "pipe_data_volume" +pipe_data_volume.whitelist = (MonitoredPipe,) +pipe_data_volume.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pipe"}, resolution=1 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pipe"}, resolution=1 + ), +} diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 42e3923..81acdd5 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -77,6 +77,7 @@ def user_demand(job_queue: JobQueue) -> List[Dict]: :param scheduler: the scheduler :return: list of records for logging """ + # print("user_demand", job_queue) result = [{"value": len(job_queue)}] return result @@ -161,6 +162,7 @@ def job_events(job: Job) -> List[Dict]: "pool_type": "drone", "pool": repr(job.drone), "job": repr(job), + "cached": str(job._cached_data), } if job.successful is None: result["queue_time"] = job.queue_date @@ -168,7 +170,15 @@ def job_events(job: Job) -> List[Dict]: result["starting"] = 1 elif job.successful: result["wall_time"] = job.walltime + result["original_walltime"] = job._original_walltime + result["calculation_time"] = job._calculation_time + result["transfer_time"] = job._transfer_time result["success"] = 1 + result["diff"] = job.walltime - job._original_walltime + result["efficiency"] = job.cputime * 1.0 / job.walltime + result["data_througput"] = ( + job._total_input_data / 1000.0 / 1000.0 / 1000.0 / job.walltime + ) else: result["success"] = 0 error_logged = False @@ -196,10 +206,12 @@ def job_events(job: Job) -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool", "job", "cached"}, + resolution=1, ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool", "job"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool", "job", "cached"}, + resolution=1, ), } @@ -269,6 +281,7 @@ def drone_statistics_caching(drone: Drone) -> List[Dict]: "pool": repr(drone), "claimed_slots": full_resources["cores"] - resources["cores"], "free_slots": resources["cores"], + "slots_with_caching": drone.jobs_with_cached_data, } ] return results diff --git a/lapis/scheduler.py b/lapis/scheduler.py index dda0733..84460e5 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -18,6 +18,8 @@ from numpy import mean +from usim import time + class JobQueue(list): pass @@ -66,6 +68,8 @@ def access_wrapped(name, requested=True): return (1 / 1024 / 1024) * access_wrapped("memory", requested=True) elif "requestdisk" in item: return (1 / 1024) * access_wrapped("disk", requested=True) + elif "requestwalltime" in item: + return 
self._wrapped.requested_walltime elif "cpus" in item: try: return self._temp["cores"] @@ -80,12 +84,12 @@ def access_wrapped(name, requested=True): try: return (1 / 1024) * self._temp["disk"] except KeyError: + return (1 / 1024) * access_wrapped("disk", requested=False) elif "cache_demand" in item: caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) - try: # print(mean( # [cache.connection._throughput_scale for cache in caches] @@ -119,6 +123,12 @@ def access_wrapped(name, requested=True): # print(self._wrapped, self._wrapped.cached_data / 1000. / 1000. / 1000.) return self._wrapped.cached_data / 1000.0 / 1000.0 / 1000.0 + elif "data_volume" in item: + return self._wrapped._total_input_data + + elif "current_waiting_time" in item: + return time.now - self._wrapped.queue_date + return super(WrappedClassAd, self).__getitem__(item) def clear_temporary_resources(self): @@ -633,6 +643,7 @@ def _match_job( "Requirements", my=drone, target=job ): return drone + job._wrapped.failed_matches += 1 raise NoMatch() async def _schedule_jobs(self): @@ -659,6 +670,13 @@ async def _schedule_jobs(self): - value ) pre_job_drones.update(matched_drone) + if ( + candidate_job._wrapped._total_input_data + and matched_drone._wrapped.cached_data + ): + candidate_job._wrapped._cached_data = ( + matched_drone._wrapped.cached_data + ) if not matches: return # TODO: optimize for few matches, many matches, all matches diff --git a/lapis/simulator.py b/lapis/simulator.py index 4aa50b3..ebbe929 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -12,7 +12,13 @@ from lapis.drone import Drone from lapis.job import job_to_queue_scheduler from lapis.connection import Connection -from lapis.monitor.caching import storage_status, pipe_status, hitrate_evaluation +from lapis.monitor.caching import ( + storage_status, + pipe_status, + hitrate_evaluation, + simulation_id, + pipe_data_volume, +) from lapis.monitor.general import ( user_demand, job_statistics, @@ -54,6 +60,8 @@ def enable_monitoring(self): self.monitoring.register_statistic(pipe_status) self.monitoring.register_statistic(drone_statistics_caching) self.monitoring.register_statistic(hitrate_evaluation) + self.monitoring.register_statistic(simulation_id) + self.monitoring.register_statistic(pipe_data_volume) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) @@ -89,6 +97,7 @@ def create_connection_module(self, remote_throughput): def run(self, until=None): monitor.SIMULATION_START = pytime.time() + print(f"[lapis-{monitor.SIMULATION_START}] running until {until}") run(self._simulate(until)) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 0d3914d..f58c477 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -2,6 +2,7 @@ from usim import time, Resources, Scope from monitoredpipe import MonitoredPipe +from lapis.monitor import sampling_required from lapis.files import StoredFile, RequestedFile, RequestedFile_HitrateBased from lapis.interfaces._storage import Storage, LookUpInformation @@ -28,6 +29,7 @@ def used(self): async def transfer(self, file: RequestedFile, **kwargs): await self.connection.transfer(total=file.filesize) + await sampling_required.put(self.connection) async def add(self, file: StoredFile, **kwargs): raise NotImplementedError @@ -291,8 +293,11 @@ async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): # ) if file.cachehitrate: await self.connection.transfer(total=file.filesize) + await 
sampling_required.put(self.connection) else: - await self.remote_storage.connection.transfer(total=file.filesize) + print("wants to read from remote") + print("file is not cached but cache is file source, this should not occur") + raise ValueError def find(self, requested_file: RequestedFile_HitrateBased, job_repr=None): # return LookUpInformation(requested_file.filesize, self) diff --git a/monitoredpipe.py b/monitoredpipe.py index 50e096c..1dab839 100644 --- a/monitoredpipe.py +++ b/monitoredpipe.py @@ -1,5 +1,6 @@ from usim import Pipe, instant from usim._primitives.notification import Notification +from typing import Optional class MonitoredPipe(Pipe): @@ -7,6 +8,7 @@ def __init__(self, throughput: float): super().__init__(throughput) self._monitor = Notification() self.storage = None + self.transferred_data = 0 async def load(self): """ @@ -19,16 +21,21 @@ async def report_load(pipe: MonitoredPipe): Currently only works for loads exceeding 100%. """ await instant - yield sum(self._subscriptions.values()) while True: await self._monitor + print(time.now, "woke up:", time.now, self, self._subscriptions) yield sum(self._subscriptions.values()) def _throttle_subscribers(self): + print(time.now, "awakening monitors, throttling subscribers") self._monitor.__awake_all__() super()._throttle_subscribers() + async def transfer(self, total: float, throughput: Optional[float] = None) -> None: + await super().transfer(total, throughput) + self.transferred_data += total + def __repr__(self): return "<%s: %s>" % (self.__class__.__name__, self.storage or id(self)) From 13cdd4ba482a1522ca7d40c62e1d484a1a9411ac Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 16:01:12 +0100 Subject: [PATCH 541/648] changed connection creation to allow for monitoring --- custom_simulate.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index 969d086..0713b97 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -8,6 +8,7 @@ from lapis.connection import Connection from lapis.drone import Drone from lapis.job_io.htcondor import htcondor_job_reader + from lapis.pool import StaticPool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader @@ -76,6 +77,7 @@ def ini_and_run( pre_job_rank=pre_job_rank_defaults, machine_ads=machine_ad_defaults, job_ads=job_ad_defaults, + additional_identifier=None, ): # ini logging to file monitoring_logger = logging.getLogger() @@ -107,8 +109,8 @@ def ini_and_run( print( "scheduler configuration: \n " - "\tpre job rank: {} \n" - "\tmachine classad: {}\n" + "\tpre job rank: {} \n\n" + "\tmachine classad:\n \t{}\n\n" "\tjob classad: {}".format(pre_job_rank, machine_ads, job_ads) ) @@ -119,8 +121,11 @@ def ini_and_run( job_ad=job_ads, ) - simulator.create_connection_module(remote_throughput * 1000 * 1000 * 1000) + simulator.connection = Connection( + remote_throughput * 1000 * 1000 * 1000, filebased_caching=False + ) dummy_pool_connection = Connection(float("Inf")) + print("dummy:", dummy_pool_connection.remote_connection.connection) with open(storage_file, "r") as storage_file: simulator.create_storage( storage_input=storage_file, @@ -162,6 +167,7 @@ def ini_and_run( # job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_cpu.json" + # job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_caching.json" # job_file = 
"/home/tabea/work/testdata/modified/single_job.json" # job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" # job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" @@ -184,13 +190,16 @@ def ini_and_run( "/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv", "/home/tabea/work/testdata/fullsim/dummycluster.csv", ] - # pool_files = ["/home/tabea/work/testdata/hitratebased/minimal_pool.csv"] - storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" + # pool_files = ["/home/tabea/work/testdata/fullsim/minimal_pool.csv"] + # pool_files = ["/home/tabea/work/testdata/fullsim/dummycluster.csv"] + # pool_files = ["/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv"] + # storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" + storage_file = "/home/tabea/work/testdata/fullsim/minimal_cache.csv" storage_type = "filehitrate" ini_and_run( job_file=job_file, remote_throughput=0.75, - calculation_efficiency=0.99, + calculation_efficiency=0.9, pool_files=pool_files, storage_file=storage_file, storage_type=storage_type, @@ -204,9 +213,7 @@ def ini_and_run( """.strip(), job_ads=""" Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory - Rank = 0 """, + Rank = 0""", ) - # * (target.cache_demand > 0.1) * target.cache_demand * - # target.cache_throughput - # rank = my.pipe_utilization + my.average_throughput + # target.cached_data * (target.cache_demand > 0.1) From a1208a1b5154904f76f749f244073414c01e417d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 16:31:48 +0100 Subject: [PATCH 542/648] replaced MonitoredPipe --- monitoredpipe.py | 66 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/monitoredpipe.py b/monitoredpipe.py index 1dab839..e959f31 100644 --- a/monitoredpipe.py +++ b/monitoredpipe.py @@ -1,36 +1,70 @@ -from usim import Pipe, instant +from typing import NamedTuple, Optional, Deque, Any, Dict, AsyncIterable + +from usim import Pipe, instant, time +from collections import deque from usim._primitives.notification import Notification -from typing import Optional + + +class MonitoredPipeInfo(NamedTuple): + requested_throughput: float + available_throughput: float + pipename: Optional[str] + throughputscale: float + no_subscriptions: int class MonitoredPipe(Pipe): def __init__(self, throughput: float): super().__init__(throughput) self._monitor = Notification() + self._monitor_buffers: Dict[Any, Deque[MonitoredPipeInfo]] = {} self.storage = None self.transferred_data = 0 - async def load(self): + async def load(self) -> AsyncIterable[MonitoredPipeInfo]: """ Monitor any changes of the throughput load of the pipe .. code:: python3 async def report_load(pipe: MonitoredPipe): - async for throughput in pipe.load(): - print(f'{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]') - .. note:: - Currently only works for loads exceeding 100%. 
+ async for event in pipe.load(): + print( + f'{time.now:6.0f}:' + f'{event.requested_throughput} \t' + f'[{event.requested_throughput / event.available_throughput * 100:03.0f}%]' + ) """ await instant - yield sum(self._subscriptions.values()) - while True: - await self._monitor - print(time.now, "woke up:", time.now, self, self._subscriptions) - yield sum(self._subscriptions.values()) + yield self._sample_state() + sentinel = object() + self._monitor_buffers[ + sentinel + ] = buffer = deque() # type: Deque[MonitoredPipeInfo] + try: + while True: + while buffer: + yield buffer.popleft() + await self._monitor + finally: + del self._monitor_buffers[sentinel] def _throttle_subscribers(self): print(time.now, "awakening monitors, throttling subscribers") + self._monitor.__awake_all__() super()._throttle_subscribers() + data = self._sample_state() + for buffer in self._monitor_buffers.values(): + print(buffer) + buffer.append(data) + + def _sample_state(self): + return MonitoredPipeInfo( + sum(self._subscriptions.values()), + self.throughput, + repr(self), + self._throughput_scale, + len(self._subscriptions), + ) async def transfer(self, total: float, throughput: Optional[float] = None) -> None: await super().transfer(total, throughput) @@ -41,12 +75,14 @@ def __repr__(self): if __name__ == "__main__": - from usim import time, run, Scope + from usim import run, Scope async def report_load(pipe: MonitoredPipe): - async for throughput in pipe.load(): + async for event in pipe.load(): print( - f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" + f"{time.now:6.0f}:" + f"{event.requested_throughput} \t" + f"[{event.requested_throughput / event.available_throughput * 100:03.0f}%]" ) async def perform_load(pipe: MonitoredPipe, delay, amount): From 5e36c01f05fc2a84bcb4974c63f51f760bdfd66e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 16:33:03 +0100 Subject: [PATCH 543/648] old monitoredpipe --- monitoredpipe_old.py | 63 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 monitoredpipe_old.py diff --git a/monitoredpipe_old.py b/monitoredpipe_old.py new file mode 100644 index 0000000..13bc5ef --- /dev/null +++ b/monitoredpipe_old.py @@ -0,0 +1,63 @@ +from usim import Pipe, instant +from usim._primitives.notification import Notification +from typing import Optional + + +class MonitoredPipe(Pipe): + def __init__(self, throughput: float): + super().__init__(throughput) + self._monitor = Notification() + self.storage = None + self.transferred_data = 0 + + async def load(self): + """ + Monitor any changes of the throughput load of the pipe + .. code:: python3 + async def report_load(pipe: MonitoredPipe): + async for throughput in pipe.load(): + print(f'{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]') + .. note:: + Currently only works for loads exceeding 100%. 
+ """ + await instant + yield sum(self._subscriptions.values()) + while True: + await self._monitor + yield sum(self._subscriptions.values()) + + def _throttle_subscribers(self): + self._monitor.__awake_all__() + super()._throttle_subscribers() + + async def transfer(self, total: float, throughput: Optional[float] = None) -> None: + await super().transfer(total, throughput) + self.transferred_data += total + + def __repr__(self): + return "<%s: %s>" % (self.__class__.__name__, self.storage or id(self)) + + +if __name__ == "__main__": + from usim import time, run, Scope + + async def report_load(pipe: MonitoredPipe): + async for throughput in pipe.load(): + print( + f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" + ) + + async def perform_load(pipe: MonitoredPipe, delay, amount): + await (time + delay) + await pipe.transfer(amount, pipe.throughput / 2) + + async def main(): + pipe = MonitoredPipe(128) + async with Scope() as scope: + scope.do(report_load(pipe), volatile=True) + scope.do(perform_load(pipe, 0, 512)) + scope.do(perform_load(pipe, 4, 1024)) + scope.do(perform_load(pipe, 6, 128)) + scope.do(perform_load(pipe, 12, 1024)) + + run(main()) From 34c9ec033ec42ba34ce878f50db2c39c1f6ef6c9 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 16:34:04 +0100 Subject: [PATCH 544/648] adapted to new monitoredpipe --- lapis/connection.py | 13 +++---------- lapis/monitor/caching.py | 14 ++++---------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index d6c68d6..f59e08d 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -13,7 +13,7 @@ from lapis.storageelement import StorageElement, RemoteStorage from lapis.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor import sampling_required -from lapis.monitor.caching import MonitoredPipeInfo, HitrateInfo +from lapis.monitor.caching import HitrateInfo class Connection(object): @@ -48,15 +48,8 @@ async def report_load_to_monitoring(pipe: MonitoredPipe): pipe._subscriptions, pipe._throughput_scale, ) - await sampling_required.put( - MonitoredPipeInfo( - throughput, - pipe.throughput, - repr(pipe), - pipe._throughput_scale, - len(pipe._subscriptions), - ) - ) + print(throughput) + await sampling_required.put(throughput) # print( # f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" # ) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index eb73731..e908279 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -1,6 +1,6 @@ import logging -from typing import NamedTuple, Optional +from typing import NamedTuple from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter @@ -11,20 +11,12 @@ SIMULATION_START, ) from lapis.storageelement import StorageElement -from monitoredpipe import MonitoredPipe +from monitoredpipe import MonitoredPipe, MonitoredPipeInfo import time as pytime from usim import time -class MonitoredPipeInfo(NamedTuple): - requested_throughput: float - available_throughput: float - pipename: Optional[str] - throughputscale: float - no_subscriptions: int - - class HitrateInfo(NamedTuple): hitrate: float volume: float @@ -123,6 +115,8 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: # :param storage: # :return: # """ + # print(pipeinfo) + # print(pipeinfo.requested_throughput) results = [ { "pipe": repr(pipeinfo.pipename), From 76a6d4326e4d28a68064ca397f1731434dd03a15 Mon Sep 17 
00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 22:56:25 +0100 Subject: [PATCH 545/648] changed monitoring resolution to ns --- lapis/monitor/__init__.py | 4 +++- lapis/monitor/caching.py | 16 ++++++++++------ lapis/monitor/cobald.py | 8 ++++---- lapis/monitor/general.py | 28 ++++++++++++++-------------- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py index 47feb59..7e7b1c5 100644 --- a/lapis/monitor/__init__.py +++ b/lapis/monitor/__init__.py @@ -5,6 +5,7 @@ from cobald.monitor.format_json import JsonFormatter from usim import time, Queue +from usim._core.loop import __LOOP_STATE__ SIMULATION_START = None @@ -26,7 +27,8 @@ class SimulationTimeFilter(logging.Filter): """ def filter(self, record) -> bool: - record.created = time.now + # record.created = time.now + record.created = time.now + (1e-9 * __LOOP_STATE__.LOOP.turn) return True diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index e908279..945a7d6 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -139,10 +139,10 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1 + tags={"tardis", "pipe"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1 + tags={"tardis", "pipe"}, resolution=1e-9 ), } @@ -153,8 +153,12 @@ def pipe_data_volume(pipe: MonitoredPipe): :param pipe: :return: """ - results = [{"pipe": repr(pipe), "current_total": pipe.transferred_data}] - print(results) + results = [ + { + "pipe": repr(pipe), + "current_total": pipe.transferred_data / 1000.0 / 1000.0 / 1000.0, + } + ] return results @@ -164,9 +168,9 @@ def pipe_data_volume(pipe: MonitoredPipe): LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1 + tags={"tardis", "pipe"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1 + tags={"tardis", "pipe"}, resolution=1e-9 ), } diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py index 710ee0c..1380651 100644 --- a/lapis/monitor/cobald.py +++ b/lapis/monitor/cobald.py @@ -37,10 +37,10 @@ def drone_statistics(drone: Drone) -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 ), } @@ -72,9 +72,9 @@ def pool_statistics(pool: Pool) -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, 
resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 ), } diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 81acdd5..792932d 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -61,11 +61,11 @@ def resource_statistics(drone: Drone) -> List[Dict]: # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, - resolution=1, + resolution=1e-9, ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, - resolution=1, + resolution=1e-9, ), } @@ -88,10 +88,10 @@ def user_demand(job_queue: JobQueue) -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis"}, resolution=1e-9 ), } @@ -128,10 +128,10 @@ def job_statistics(scheduler: CondorJobScheduler) -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1 + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 ), } @@ -207,11 +207,11 @@ def job_events(job: Job) -> List[Dict]: # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool", "job", "cached"}, - resolution=1, + resolution=1e-9, ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool", "job", "cached"}, - resolution=1, + resolution=1e-9, ), } @@ -233,11 +233,11 @@ def pool_status(pool: Pool) -> List[Dict]: # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, - resolution=1, + resolution=1e-9, ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, - resolution=1, + resolution=1e-9, ), } @@ -257,10 +257,10 @@ def configuration_information(simulator: "Simulator") -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 + tags={"tardis", "pool_configuration", "resource_type"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "resource_type"}, resolution=1 + tags={"tardis", "pool_configuration", "resource_type"}, resolution=1e-9 ), } @@ -293,9 +293,9 @@ def drone_statistics_caching(drone: Drone) -> List[Dict]: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_type", "pool"}, 
resolution=1 + tags={"tardis", "pool_type", "pool"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_type", "pool"}, resolution=1 + tags={"tardis", "pool_type", "pool"}, resolution=1e-9 ), } From 07dbe74ab57fa004431379e65ec108bfa72a1530 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 20 Feb 2020 22:57:43 +0100 Subject: [PATCH 546/648] adapted monitored pipe monitoring --- lapis/connection.py | 17 +++-------------- monitoredpipe.py | 1 - 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index f59e08d..b337fff 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -39,20 +39,9 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): async def run_pipemonitoring(self): async def report_load_to_monitoring(pipe: MonitoredPipe): - async for throughput in pipe.load(): - print( - time.now, - "registered change, sending to monitoring", - time.now, - pipe, - pipe._subscriptions, - pipe._throughput_scale, - ) - print(throughput) - await sampling_required.put(throughput) - # print( - # f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" - # ) + async for information in pipe.load(): + # print(information) + await sampling_required.put(information) async with Scope() as scope: scope.do(report_load_to_monitoring(self.remote_connection.connection)) diff --git a/monitoredpipe.py b/monitoredpipe.py index e959f31..05fa5c2 100644 --- a/monitoredpipe.py +++ b/monitoredpipe.py @@ -54,7 +54,6 @@ def _throttle_subscribers(self): super()._throttle_subscribers() data = self._sample_state() for buffer in self._monitor_buffers.values(): - print(buffer) buffer.append(data) def _sample_state(self): From 32e3405e6bfa688955808fcec9d38f78f415cd90 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 21 Feb 2020 07:05:12 +0100 Subject: [PATCH 547/648] added classad --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6f34b1c..bb3662b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ 'Programming Language :: Python :: 3.7' ] -requires = ["cobald", "usim >= 0.4.3", "click"] +requires = ["cobald", "usim >= 0.4.3", "click", "classad"] [tool.flit.metadata.requires-extra] test = [ From d3c4e43065901c2e683bb97c0171e6ced1d2b5e6 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 21 Feb 2020 09:14:23 +0100 Subject: [PATCH 548/648] debug --- lapis/scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 84460e5..f563bad 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -50,6 +50,7 @@ class WrappedClassAd(ClassAd, Generic[DJ]): def __init__(self, classad: ClassAd, wrapped: DJ): super(WrappedClassAd, self).__init__() self._wrapped = wrapped + print(classad, type(classad), repr(classad)) self._data = classad._data self._temp = {} From 2085d222a064ee4b2d88f52e36f88b667770391b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 21 Feb 2020 17:10:46 +0100 Subject: [PATCH 549/648] extended monitoring and removed debug output --- lapis/drone.py | 36 ++++++++++++++++ lapis/monitor/duplicates.py | 77 ++++++++++++++++++++++++++++++++++ lapis/scheduler.py | 82 ++++++++++++++++++++++++++++--------- lapis/simulator.py | 3 ++ monitoredpipe.py | 2 +- 5 files changed, 179 insertions(+), 21 deletions(-) create mode 100644 lapis/monitor/duplicates.py diff --git a/lapis/drone.py b/lapis/drone.py index 
5c3ca64..8e025a5 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -6,6 +6,8 @@ from lapis.job import Job from lapis.connection import Connection +from lapis.monitor.duplicates import DroneStatusCaching + class ResourcesExceeded(Exception): ... @@ -68,6 +70,14 @@ async def run(self): await (time + self.scheduling_duration) self._supply = 1 self.scheduler.register_drone(self) + await sampling_required.put( + DroneStatusCaching( + repr(self), + self.pool_resources["cores"], + self.theoretical_available_resources["cores"], + self.jobs_with_cached_data, + ) + ) await sampling_required.put(self) async with Scope() as scope: async for job, kill in self._job_queue: @@ -118,7 +128,16 @@ async def shutdown(self): self._supply = 0 self.scheduler.unregister_drone(self) + await sampling_required.put( + DroneStatusCaching( + repr(self), + self.pool_resources["cores"], + self.theoretical_available_resources["cores"], + self.jobs_with_cached_data, + ) + ) await sampling_required.put(self) # TODO: introduce state of drone + await (time + 1) async def schedule_job(self, job: Job, kill: bool = False): @@ -147,6 +166,14 @@ async def _run_job(self, job: Job, kill: bool): self.jobs_with_cached_data += 1 try: async with self.resources.claim(**job.resources): + await sampling_required.put( + DroneStatusCaching( + repr(self), + self.pool_resources["cores"], + self.theoretical_available_resources["cores"], + self.jobs_with_cached_data, + ) + ) await sampling_required.put(self) if kill: for resource_key in job.resources: @@ -180,10 +207,19 @@ async def _run_job(self, job: Job, kill: bool): self.jobs -= 1 if job._cached_data: self.jobs_with_cached_data -= 1 + await self.scheduler.job_finished(job) self._utilisation = self._allocation = None self.scheduler.update_drone(self) await sampling_required.put(self) + await sampling_required.put( + DroneStatusCaching( + repr(self), + self.pool_resources["cores"], + self.theoretical_available_resources["cores"], + self.jobs_with_cached_data, + ) + ) def look_up_cached_data(self, job: Job): cached_data = 0 diff --git a/lapis/monitor/duplicates.py b/lapis/monitor/duplicates.py new file mode 100644 index 0000000..a599670 --- /dev/null +++ b/lapis/monitor/duplicates.py @@ -0,0 +1,77 @@ +import logging.handlers +from typing import NamedTuple, List, Dict +from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler + +from cobald.monitor.format_json import JsonFormatter +from cobald.monitor.format_line import LineProtocolFormatter + + +class UserDemand(NamedTuple): + value: int + + +class DroneStatusCaching(NamedTuple): + drone: str + slots_tot: int + slots_free: int + slots_caching: int + + +def user_demand_tmp(user_demand: UserDemand) -> List[Dict]: + """ + Log global user demand. 
+ + :param scheduler: the scheduler + :return: list of records for logging + """ + # print("user_demand", job_queue) + result = [{"value": user_demand.value}] + return result + + +user_demand_tmp.name = "user_demand_tmp" +user_demand_tmp.whitelist = (UserDemand,) +user_demand_tmp.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1e-9 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1e-9 + ), +} + + +def drone_statistics_caching_tmp(dronestatus: DroneStatusCaching) -> List[Dict]: + """ + + + :param drone: the drone + :return: list of records for logging + """ + + results = [ + { + "pool_type": "drone", + "pool": dronestatus.drone, + "claimed_slots": dronestatus.slots_tot - dronestatus.slots_free, + "free_slots": dronestatus.slots_free, + "slots_with_caching": dronestatus.slots_caching, + } + ] + return results + + +drone_statistics_caching_tmp.name = "drone_status_caching_tmp" +drone_statistics_caching_tmp.whitelist = (DroneStatusCaching,) +drone_statistics_caching_tmp.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_type", "pool"}, resolution=1e-9 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pool_type", "pool"}, resolution=1e-9 + ), +} diff --git a/lapis/scheduler.py b/lapis/scheduler.py index f563bad..26d0653 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -15,6 +15,7 @@ from lapis.drone import Drone from lapis.job import Job from lapis.monitor import sampling_required +from lapis.monitor.duplicates import UserDemand from numpy import mean @@ -50,7 +51,6 @@ class WrappedClassAd(ClassAd, Generic[DJ]): def __init__(self, classad: ClassAd, wrapped: DJ): super(WrappedClassAd, self).__init__() self._wrapped = wrapped - print(classad, type(classad), repr(classad)) self._data = classad._data self._temp = {} @@ -63,34 +63,50 @@ def access_wrapped(name, requested=True): return self._wrapped.used_resources[name] if "target" not in item: - if "requestcpus" in item: + if "requestcpus" == item: return access_wrapped("cores", requested=True) - elif "requestmemory" in item: + elif "requestmemory" == item: return (1 / 1024 / 1024) * access_wrapped("memory", requested=True) - elif "requestdisk" in item: + elif "requestdisk" == item: return (1 / 1024) * access_wrapped("disk", requested=True) - elif "requestwalltime" in item: + elif "requestwalltime" == item: return self._wrapped.requested_walltime - elif "cpus" in item: + elif "cpus" == item: try: return self._temp["cores"] except KeyError: return access_wrapped("cores", requested=False) - elif "memory" in item: + elif "memory" == item: try: return (1 / 1000 / 1000) * self._temp["memory"] except KeyError: return (1 / 1000 / 1000) * access_wrapped("memory", requested=False) - elif "disk" in item: + elif "disk" == item: try: return (1 / 1024) * self._temp["disk"] except KeyError: return (1 / 1024) * access_wrapped("disk", requested=False) - elif "cache_demand" in item: + elif "cache_demand" == item: caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) + # print(caches) + try: + # print(mean( + # [1. 
/ cache.connection._throughput_scale for cache in caches] + # )) + return mean( + [1.0 / cache.connection._throughput_scale for cache in caches] + ) + except TypeError: + return 0 + + elif "cache_scale" == item: + caches = self._wrapped.connection.storages.get( + self._wrapped.sitename, None + ) + # print(caches) try: # print(mean( # [cache.connection._throughput_scale for cache in caches] @@ -100,7 +116,8 @@ def access_wrapped(name, requested=True): ) except TypeError: return 0 - elif "cache_throughput_per_core" in item: + + elif "cache_throughput_per_core" == item: caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) @@ -110,26 +127,35 @@ def access_wrapped(name, requested=True): # [cache.connection.throughput / 1000. / 1000. / 1000. for cache # in # caches] - # ) / float(access_wrapped("cores"))) + # ) / float(self._wrapped.pool_resources["cores"])) return sum( [ cache.connection.throughput / 1000.0 / 1000.0 / 1000.0 for cache in caches ] - ) / float(access_wrapped("cores")) + ) / float(self._wrapped.pool_resources["cores"]) except TypeError: return 0 - elif "cached_data" in item: + elif "cached_data" == item: # print(self._wrapped, self._wrapped.cached_data / 1000. / 1000. / 1000.) return self._wrapped.cached_data / 1000.0 / 1000.0 / 1000.0 - elif "data_volume" in item: - return self._wrapped._total_input_data + elif "data_volume" == item: + return self._wrapped._total_input_data / 1000.0 / 1000.0 / 1000.0 - elif "current_waiting_time" in item: + elif "current_waiting_time" == item: return time.now - self._wrapped.queue_date + elif "failed_matches" == item: + # print("evaluated", self._wrapped, self._wrapped.failed_matches) + return self._wrapped.failed_matches + + elif "jobs_with_cached_data" == item: + # print(self._wrapped) + # print(self._wrapped.jobs_with_cached_data) + return self._wrapped.jobs_with_cached_data + return super(WrappedClassAd, self).__getitem__(item) def clear_temporary_resources(self): @@ -270,6 +296,7 @@ async def run(self): await best_match.schedule_job(job) self.job_queue.remove(job) await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) self.unregister_drone(best_match) left_resources = best_match.theoretical_available_resources left_resources = { @@ -291,6 +318,7 @@ async def _collect_jobs(self): await self._processing.increase(jobs=1) # TODO: logging happens with each job await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) self._collecting = False async def job_finished(self, job): @@ -518,6 +546,7 @@ def items(self) -> Iterator[Tuple[float, Set[WrappedClassAd[DJ]]]]: return iter(self._clusters.items()) def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: + # print(self._clusters.items()) for ranked_key, drones in self._clusters.items(): yield [{item} for item in drones] @@ -610,12 +639,21 @@ async def run(self): def _match_job( job: ClassAd, pre_job_clusters: Iterator[List[Set[WrappedClassAd[Drone]]]] ): + def debug_evaluate(expr, my, target=None): + if type(expr) is str: + expr = my[expr] + result = expr.evaluate(my=my, target=target) + # print(f'>>> {expr}, {my}, {target}\n... 
{result}') + return result + if job["Requirements"] != Undefined(): pre_job_clusters_tmp = [] for cluster_group in pre_job_clusters: cluster_group_tmp = [] for cluster in cluster_group: - if job.evaluate("Requirements", my=job, target=next(iter(cluster))): + if debug_evaluate( + "Requirements", my=job, target=next(iter(cluster)) + ): cluster_group_tmp.append(cluster) pre_job_clusters_tmp.append(cluster_group_tmp) pre_job_clusters = pre_job_clusters_tmp @@ -627,13 +665,12 @@ def _match_job( sorted( cluster_group, key=lambda cluster: ( - job.evaluate("Rank", my=job, target=next(iter(cluster))), + debug_evaluate("Rank", my=job, target=next(iter(cluster))), random.random(), ), reverse=True, ) ) - pre_job_clusters = pre_job_clusters_tmp for cluster_group in pre_job_clusters: @@ -644,7 +681,7 @@ def _match_job( "Requirements", my=drone, target=job ): return drone - job._wrapped.failed_matches += 1 + raise NoMatch() async def _schedule_jobs(self): @@ -654,11 +691,14 @@ async def _schedule_jobs(self): matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] for queue_index, candidate_job in enumerate(self.job_queue): try: + # print(time.now, candidate_job._wrapped, + # candidate_job._wrapped.requested_inputfiles) pre_job_drones.lookup(candidate_job._wrapped) matched_drone = self._match_job( candidate_job, pre_job_drones.cluster_groups() ) except NoMatch: + candidate_job._wrapped.failed_matches += 1 continue else: matches.append((queue_index, candidate_job, matched_drone)) @@ -689,6 +729,7 @@ async def _schedule_jobs(self): await sampling_required.put(self) # NOTE: Is this correct? Triggers once instead of for each job await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) async def _execute_job(self, job: WrappedClassAd, drone: WrappedClassAd): wrapped_job = job._wrapped @@ -704,6 +745,7 @@ async def _collect_jobs(self): # TODO: logging happens with each job # TODO: job queue to the outside now contains wrapped classads... 
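The Rank handling in _match_job above sorts each cluster group by the evaluated Rank expression and uses random.random() as a secondary sort key, so equally ranked drones are picked in random order instead of always favouring the first one. The same idea in isolation, with invented drone names and rank values:

import random

# (drone, evaluated pre-job rank) -- values made up for the example
drones = [("slot_a", 2.0), ("slot_b", 2.0), ("slot_c", 2.0), ("slot_d", 1.0)]

# highest rank first; ties are broken by a fresh random number per element,
# which is the same trick as in the sorted(...) call above
ordered = sorted(drones, key=lambda d: (d[1], random.random()), reverse=True)
print(ordered)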
await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) self._collecting = False async def job_finished(self, job): diff --git a/lapis/simulator.py b/lapis/simulator.py index ebbe929..20a3142 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -28,6 +28,7 @@ job_events, drone_statistics_caching, ) +from lapis.monitor.duplicates import user_demand_tmp, drone_statistics_caching_tmp from lapis.monitor.cobald import drone_statistics, pool_statistics from lapis.pool import Pool @@ -62,6 +63,8 @@ def enable_monitoring(self): self.monitoring.register_statistic(hitrate_evaluation) self.monitoring.register_statistic(simulation_id) self.monitoring.register_statistic(pipe_data_volume) + self.monitoring.register_statistic(user_demand_tmp) + self.monitoring.register_statistic(drone_statistics_caching_tmp) def create_job_generator(self, job_input, job_reader): self._job_generators.append((job_input, job_reader)) diff --git a/monitoredpipe.py b/monitoredpipe.py index 05fa5c2..e663a94 100644 --- a/monitoredpipe.py +++ b/monitoredpipe.py @@ -48,7 +48,7 @@ async def report_load(pipe: MonitoredPipe): del self._monitor_buffers[sentinel] def _throttle_subscribers(self): - print(time.now, "awakening monitors, throttling subscribers") + # print(time.now, "awakening monitors, throttling subscribers") self._monitor.__awake_all__() super()._throttle_subscribers() From c6e3766a9b91be6d7d36ff6786e21ecaca27c984 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 21 Feb 2020 17:16:20 +0100 Subject: [PATCH 550/648] removed debug output --- lapis/monitor/caching.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 945a7d6..e03fac5 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -129,7 +129,6 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: "no_subscribers": pipeinfo.no_subscriptions, } ] - print(time.now, "monitoring:", results) return results From 262f71fe891b7e9fcd542525bc3483dd6d978ea6 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 23 Feb 2020 17:40:01 +0100 Subject: [PATCH 551/648] adjusted resolution and extended job event metric --- lapis/monitor/caching.py | 12 ++++++------ lapis/monitor/general.py | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index e03fac5..1de850f 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -45,10 +45,10 @@ def simulation_id(simulationinfo) -> list: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis"}, resolution=1e-9 ), } @@ -70,10 +70,10 @@ def hitrate_evaluation(hitrateinfo: HitrateInfo) -> list: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1 + tags={"tardis"}, resolution=1e-9 ), } @@ -101,10 +101,10 @@ def storage_status(storage: StorageElement) -> list: LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: 
LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1 + tags={"tardis", "storage"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1 + tags={"tardis", "storage"}, resolution=1e-9 ), } diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 792932d..668fb77 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -176,6 +176,7 @@ def job_events(job: Job) -> List[Dict]: result["success"] = 1 result["diff"] = job.walltime - job._original_walltime result["efficiency"] = job.cputime * 1.0 / job.walltime + result["read_from_cache"] = job._read_from_cache result["data_througput"] = ( job._total_input_data / 1000.0 / 1000.0 / 1000.0 / job.walltime ) From 296a316b252af6d526fea08236eafac19550c2dd Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 23 Feb 2020 17:40:49 +0100 Subject: [PATCH 552/648] jobs keep drones after finishing for monitoring purposes --- lapis/job.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index 308edd9..8c0f1c2 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -179,11 +179,11 @@ async def run(self, drone: "Drone"): scope.do(self._transfer_inputfiles()) scope.do(self._calculate()) except CancelTask: - self.drone = None + # self.drone = None self._success = False # TODO: in_queue_until is still set except BaseException: - self.drone = None + # self.drone = None self._success = False # TODO: in_queue_until is still set raise @@ -191,7 +191,7 @@ async def run(self, drone: "Drone"): # old_walltime = self.walltime self.walltime = time.now - start # print(f"monitored walltime of {old_walltime} changed to {self.walltime}") - self.drone = None + # self.drone = None self._success = True await sampling_required.put(self) From 9c365cc724551bc26565fb16078d56cdb8de9e87 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 24 Feb 2020 11:40:24 +0100 Subject: [PATCH 553/648] adjusted cache hit monitoring --- lapis/connection.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index b337fff..32d3264 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -153,7 +153,9 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): # decision if a jobs inputfiles are cached based on hitrate random_inputfile_information = next(iter(requested_files.values())) + # print(time.now, job_repr, requested_files, drone.sitename) if "hitrates" in random_inputfile_information.keys(): + # print(job_repr, "contains hitrates") try: hitrate = sum( [ @@ -162,18 +164,22 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): ] ) / sum([file["usedsize"] for file in requested_files.values()]) provides_file = int(random.random() < hitrate) - await sampling_required.put( - HitrateInfo( - hitrate, - sum([file["usedsize"] for file in requested_files.values()]), - provides_file, - ) - ) + # print(job_repr, hitrate, provides_file) + # input() except ZeroDivisionError: + hitrate = 0 provides_file = 0 + await sampling_required.put( + HitrateInfo( + hitrate, + sum([file["usedsize"] for file in requested_files.values()]), + provides_file, + ) + ) job_repr._read_from_cache = provides_file + # print(job_repr, job_repr._read_from_cache) for inputfilename, inputfilespecs in requested_files.items(): if "hitrates" in inputfilespecs.keys(): From e1c9d4c965c5163fe15abfb308c25b752e5278b7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 24 Feb 
2020 10:09:22 +0100 Subject: [PATCH 554/648] fixed signature of copy for RankedAutoClusters --- lapis/scheduler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 26d0653..af2412f 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -389,10 +389,10 @@ class RankedClusters(Generic[DJ]): def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): raise NotImplementedError - # @abstractmethod - # def copy(self: RC[DJ]) -> RC[DJ]: - # """Copy the entire ranked auto clusters""" - # raise NotImplementedError + @abstractmethod + def copy(self: "RankedAutoClusters[DJ]") -> "RankedAutoClusters[DJ]": + """Copy the entire ranked auto clusters""" + raise NotImplementedError @abstractmethod def add(self, item: WrappedClassAd[DJ]) -> None: From dd199d3804eb2f3f7bde9fb250de4d9923976a49 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 24 Feb 2020 10:10:22 +0100 Subject: [PATCH 555/648] added possibility to check for empty drones, closes #92 --- lapis/drone.py | 5 +++++ lapis/scheduler.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/lapis/drone.py b/lapis/drone.py index 8e025a5..3ba74da 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -22,6 +22,7 @@ def __init__( ignore_resources: list = None, sitename: str = None, connection: Connection = None, + empty: callable = lambda drone: False, ): """ :param scheduler: @@ -51,6 +52,10 @@ def __init__( self._allocation = None self._utilisation = None self._job_queue = Queue() + self._empty = empty + + def empty(self): + return self._empty(self) # caching-related self.jobs_with_cached_data = 0 diff --git a/lapis/scheduler.py b/lapis/scheduler.py index af2412f..06a6a5a 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -483,6 +483,8 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: group = [] current_rank = None for ranked_key, drones in self._clusters.items(): + if next(iter(drones))._wrapped.empty(): + continue if ranked_key.rank != current_rank: current_rank = ranked_key.rank if group: From 3388254935bed27258e603748cda159d287a4bb1 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 24 Feb 2020 10:14:02 +0100 Subject: [PATCH 556/648] added underscore to ranked_key, as it is unused --- lapis/scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 06a6a5a..c1396bb 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -548,8 +548,8 @@ def items(self) -> Iterator[Tuple[float, Set[WrappedClassAd[DJ]]]]: return iter(self._clusters.items()) def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: - # print(self._clusters.items()) - for ranked_key, drones in self._clusters.items(): + + for _ranked_key, drones in self._clusters.items(): yield [{item} for item in drones] def lookup(self, job: Job): From 17ce915f475051606be61c03043f32b2cdb958e4 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 24 Feb 2020 14:02:01 +0100 Subject: [PATCH 557/648] skipping scheduling cycles when drones are all empty --- lapis/scheduler.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index c1396bb..5117d51 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -389,6 +389,11 @@ class RankedClusters(Generic[DJ]): def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): raise NotImplementedError + @abstractmethod + def empty(self) -> bool: + """"Whether there are no 
resources available""" + raise NotImplementedError + @abstractmethod def copy(self: "RankedAutoClusters[DJ]") -> "RankedAutoClusters[DJ]": """Copy the entire ranked auto clusters""" @@ -437,6 +442,12 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): self._clusters: Dict[RankedClusterKey, Set[WrappedClassAd[DJ]]] = SortedDict() self._inverse: Dict[WrappedClassAd[DJ], RankedClusterKey] = {} + def empty(self) -> bool: + for drones in self._clusters.values(): + if not next(iter(drones))._wrapped.empty(): + return False + return True + def copy(self) -> "RankedAutoClusters[DJ]": clone = type(self)(quantization=self._quantization, ranking=self._ranking) clone._clusters = SortedDict( @@ -509,6 +520,12 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): self._clusters: Dict[float, Set[WrappedClassAd[DJ]]] = SortedDict() self._inverse: Dict[WrappedClassAd[DJ], float] = {} + def empty(self) -> bool: + for drones in self._clusters.values(): + if not next(iter(drones))._wrapped.empty(): + return False + return True + def copy(self) -> "RankedNonClusters[DJ]": clone = type(self)(quantization=self._quantization, ranking=self._ranking) clone._clusters = SortedDict( @@ -689,6 +706,8 @@ def debug_evaluate(expr, my, target=None): async def _schedule_jobs(self): # Pre Job Rank is the same for all jobs # Use a copy to allow temporary "remainder after match" estimates + if self._drones.empty(): + return pre_job_drones = self._drones.copy() matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] for queue_index, candidate_job in enumerate(self.job_queue): From bd1310bce6d59bb34551c4086324e26bec2068ae Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 24 Feb 2020 16:08:47 +0100 Subject: [PATCH 558/648] declared drone as empty if < 1 core is free --- lapis/drone.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 3ba74da..84f1941 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -22,7 +22,7 @@ def __init__( ignore_resources: list = None, sitename: str = None, connection: Connection = None, - empty: callable = lambda drone: False, + empty: callable = lambda drone: drone.available_resources.get("cores", 1) < 1, ): """ :param scheduler: @@ -54,13 +54,13 @@ def __init__( self._job_queue = Queue() self._empty = empty - def empty(self): - return self._empty(self) - # caching-related self.jobs_with_cached_data = 0 self.cached_data = 0 + def empty(self): + return self._empty(self) + @property def theoretical_available_resources(self): return dict(self.resources.levels) From 0da76f973ed061be90c6d6fa3561e49acdd2576b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 24 Feb 2020 17:04:46 +0100 Subject: [PATCH 559/648] fixed skipping scheduling cycles when drones are all empty --- lapis/drone.py | 5 ++++- lapis/scheduler.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 84f1941..d12eeea 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -22,7 +22,10 @@ def __init__( ignore_resources: list = None, sitename: str = None, connection: Connection = None, - empty: callable = lambda drone: drone.available_resources.get("cores", 1) < 1, + empty: callable = lambda drone: drone.theoretical_available_resources.get( + "cores", 1 + ) + < 1, ): """ :param scheduler: diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 5117d51..90490a8 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -522,8 +522,9 @@ def __init__(self, 
quantization: Dict[str, HTCInt], ranking: Expression): def empty(self) -> bool: for drones in self._clusters.values(): - if not next(iter(drones))._wrapped.empty(): - return False + for drone in drones: + if not drone._wrapped.empty(): + return False return True def copy(self) -> "RankedNonClusters[DJ]": From 8886d9356cc560c1860fbddf50d523f0bedeea95 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 27 Feb 2020 17:28:39 +0100 Subject: [PATCH 560/648] input for testing --- custom_simulate.py | 69 ++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index 0713b97..f11327f 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -165,36 +165,11 @@ def ini_and_run( if __name__ == "__main__": - # job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" - job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_cpu.json" - # job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_caching.json" - # job_file = "/home/tabea/work/testdata/modified/single_job.json" - # job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" - # job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" - # job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ - # ".json" - # pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", - # "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] - # storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" - # storage_type = "filehitrate" - # - # ini_and_run(job_file=job_file, pool_files=pool_files, storage_file=storage_file, - # storage_type=storage_type, log_file="minimal_hitratebased_test.log", - # log_telegraf=True) - - # job_file = "/home/tabea/work/testdata/hitratebased/testjobs.json" - # job_file = "/home/tabea/work/testdata/hitratebased/week.json" - # job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" - # job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" - pool_files = [ - "/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv", - "/home/tabea/work/testdata/fullsim/dummycluster.csv", - ] - # pool_files = ["/home/tabea/work/testdata/fullsim/minimal_pool.csv"] - # pool_files = ["/home/tabea/work/testdata/fullsim/dummycluster.csv"] - # pool_files = ["/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv"] - # storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" - storage_file = "/home/tabea/work/testdata/fullsim/minimal_cache.csv" + job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" + + pool_files = ["/home/tabea/work/testdata/fullsim/minimal_pool.csv"] + + storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" storage_type = "filehitrate" ini_and_run( job_file=job_file, @@ -205,6 +180,7 @@ def ini_and_run( storage_type=storage_type, log_file="test_new_scheduler.log", log_telegraf=True, + seed=1234, # pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", pre_job_rank="0", machine_ads=""" @@ -212,8 +188,35 @@ def ini_and_run( rank = 0 """.strip(), job_ads=""" - Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory - Rank = 0""", + Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory Rank = target.cached_data """, ) - # target.cached_data * (target.cache_demand > 0.1) + +# job_file = 
"/home/tabea/work/testdata/hitratebased/job_list_minimal.json" + +# job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_caching.json" +# job_file = "/home/tabea/work/testdata/modified/single_job.json" +# job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" +# job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" +# job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ +# ".json" +# pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", +# "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] +# storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" +# storage_type = "filehitrate" +# +# ini_and_run(job_file=job_file, pool_files=pool_files, storage_file=storage_file, +# storage_type=storage_type, log_file="minimal_hitratebased_test.log", +# log_telegraf=True) + +# job_file = "/home/tabea/work/testdata/hitratebased/testjobs.json" +# job_file = "/home/tabea/work/testdata/hitratebased/week.json" +# job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" +# job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" +# pool_files = [ +# "/home/tabea/work/testdata/modified/sg_machines_shared_cache.csv", +# "/home/tabea/work/testdata/modified/dummycluster_888cores_split.csv", +# ] +# pool_files = ["/home/tabea/work/testdata/fullsim/dummycluster.csv"] +# pool_files = ["/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv"] +# storage_file = "/home/tabea/work/testdata/fullsim/minimal_cache.csv" From 272baeb3784c17638eb7946389874102051e642d Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 Feb 2020 17:56:13 +0100 Subject: [PATCH 561/648] added early exit for updated autoclusters, closes 93 --- lapis/scheduler.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 90490a8..e926cf3 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -54,6 +54,12 @@ def __init__(self, classad: ClassAd, wrapped: DJ): self._data = classad._data self._temp = {} + def empty(self): + try: + return self._temp["cores"] < 1 + except KeyError: + return self._wrapped.theoretical_available_resources["cores"] < 1 + def __getitem__(self, item): def access_wrapped(name, requested=True): if isinstance(self._wrapped, Drone): @@ -444,7 +450,7 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): def empty(self) -> bool: for drones in self._clusters.values(): - if not next(iter(drones))._wrapped.empty(): + if not next(iter(drones)).empty(): return False return True @@ -494,7 +500,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: group = [] current_rank = None for ranked_key, drones in self._clusters.items(): - if next(iter(drones))._wrapped.empty(): + if next(iter(drones)).empty(): continue if ranked_key.rank != current_rank: current_rank = ranked_key.rank @@ -523,7 +529,7 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): def empty(self) -> bool: for drones in self._clusters.values(): for drone in drones: - if not drone._wrapped.empty(): + if not drone.empty(): return False return True @@ -740,6 +746,8 @@ async def _schedule_jobs(self): candidate_job._wrapped._cached_data = ( matched_drone._wrapped.cached_data ) + if pre_job_drones.empty(): + break if not matches: return # TODO: optimize for few matches, many matches, all matches From 95794bd723dbcb49be1eb1f57d915eb87cc2eb18 Mon Sep 17 00:00:00 
2001 From: tfesenbecker Date: Fri, 28 Feb 2020 09:52:05 +0100 Subject: [PATCH 562/648] extended job_event monitoring by adding cache probability and expectation of amount of cached data --- lapis/job.py | 23 +++++++++++++++++++++++ lapis/monitor/general.py | 2 ++ 2 files changed, 25 insertions(+) diff --git a/lapis/job.py b/lapis/job.py index 8c0f1c2..c64a12d 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -34,6 +34,8 @@ class Job(object): "_transfer_time", "failed_matches", "cputime", + "cache_probability", + "expectation_cached_data", ) def __init__( @@ -96,6 +98,24 @@ def __init__( ) except AttributeError: self._total_input_data = 0 + if self._total_input_data: + self.expectation_cached_data = sum( + [ + file["usedsize"] * sum(file["hitrates"].values()) + for file in self.used_inputfiles.values() + ] + ) + else: + self.expectation_cached_data = 0 + if self._total_input_data: + self.cache_probability = sum( + [ + file["usedsize"] * sum(file["hitrates"].values()) + for file in self.used_inputfiles.values() + ] + ) / sum([file["usedsize"] for file in self.used_inputfiles.values()]) + else: + self.cache_probability = 0 self.failed_matches = 0 @property @@ -179,12 +199,15 @@ async def run(self, drone: "Drone"): scope.do(self._transfer_inputfiles()) scope.do(self._calculate()) except CancelTask: + print("CancelTask") # self.drone = None self._success = False + # await sampling_required.put(self) # TODO: in_queue_until is still set except BaseException: # self.drone = None self._success = False + await sampling_required.put(self) # TODO: in_queue_until is still set raise else: diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index 668fb77..ab5ba9f 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -180,6 +180,8 @@ def job_events(job: Job) -> List[Dict]: result["data_througput"] = ( job._total_input_data / 1000.0 / 1000.0 / 1000.0 / job.walltime ) + result["cache_probability"] = job.cache_probability + result["expectation_cached_data"] = job.expectation_cached_data else: result["success"] = 0 error_logged = False From 7e3487c4ecd770712d72f9ce9629e86eb1b7d636 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 10 May 2020 20:34:25 +0200 Subject: [PATCH 563/648] re-added import of usim Queue --- lapis/drone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/drone.py b/lapis/drone.py index d12eeea..c5803de 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,6 +1,6 @@ from cobald import interfaces -from usim import time, Scope, instant, Capacities, ResourcesUnavailable +from usim import time, Scope, instant, Capacities, ResourcesUnavailable, Queue from typing import Optional from lapis.job import Job From 1a87d844225230c4795a9125f35f97b40fca26da Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 10 May 2020 20:44:47 +0200 Subject: [PATCH 564/648] removed unnecessary debug output --- lapis/connection.py | 14 -------------- lapis/drone.py | 6 ------ lapis/job.py | 16 ---------------- lapis/monitor/caching.py | 2 -- lapis/monitor/duplicates.py | 1 - lapis/monitor/general.py | 1 - lapis/scheduler.py | 29 ----------------------------- lapis/storageelement.py | 32 -------------------------------- 8 files changed, 101 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 32d3264..e594400 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -131,13 +131,7 @@ async def stream_file( ) except KeyError: pass - # print(f"now transfering {requested_file.filesize} from {used_connection}") await 
used_connection.transfer(requested_file, job_repr=job_repr) - # print( - # "Job {}: finished transfering of file {}: {}B @ {}".format( - # job_repr, requested_file.filename, requested_file.filesize, time.now - # ) - # ) async def transfer_files(self, drone, requested_files: dict, job_repr): """ @@ -153,9 +147,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): # decision if a jobs inputfiles are cached based on hitrate random_inputfile_information = next(iter(requested_files.values())) - # print(time.now, job_repr, requested_files, drone.sitename) if "hitrates" in random_inputfile_information.keys(): - # print(job_repr, "contains hitrates") try: hitrate = sum( [ @@ -164,9 +156,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): ] ) / sum([file["usedsize"] for file in requested_files.values()]) provides_file = int(random.random() < hitrate) - # print(job_repr, hitrate, provides_file) - # input() except ZeroDivisionError: hitrate = 0 provides_file = 0 @@ -179,7 +169,6 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): ) ) job_repr._read_from_cache = provides_file - # print(job_repr, job_repr._read_from_cache) for inputfilename, inputfilespecs in requested_files.items(): if "hitrates" in inputfilespecs.keys(): @@ -193,7 +182,4 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): ) await self.stream_file(requested_file, drone.sitename, job_repr) stream_time = time.now - start_time - # print( - # "STREAMED files {} in {}".format(list(requested_files.keys()), stream_time) - # ) return stream_time diff --git a/lapis/drone.py b/lapis/drone.py index c5803de..4d2efdb 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -198,13 +198,7 @@ async def _run_job(self, job: Job, kill: bool): pass # self.scheduler.update_drone(self) await job_execution.done - # print( - # "finished job {} on drone {} @ {}".format( - # repr(job), repr(self), time.now - # ) - # ) except ResourcesUnavailable: - # print(repr(job), "ResourcesUnavailable") await instant job_execution.cancel() await instant diff --git a/lapis/job.py b/lapis/job.py index c64a12d..a66884d 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -145,11 +145,6 @@ async def _calculate(self): :param calculation_efficiency: :return: """ - # print( - # f"WALLTIME: Job {self} @ {time.now}, " - # f"{self.used_resources.get('cores', None)}, " - # f"{self.calculation_efficiency}" - # ) result = self.walltime try: if ( @@ -165,21 +160,14 @@ async def _calculate(self): except (KeyError, TypeError): pass - # start = time.now await (time + result) - # print(f"finished calculation at {time.now - start}") async def _transfer_inputfiles(self): start = time.now try: - # print(f"TRANSFERING INPUTFILES: Job {self} @ {start}") await self.drone.connection.transfer_files( drone=self.drone, requested_files=self.used_inputfiles, job_repr=self ) - # print( - # f"streamed inputfiles {self.used_inputfiles.keys()} for job {self} " - # f"in {time.now - start} timeunits, finished @ {time.now}" - # ) except AttributeError: pass self._transfer_time = time.now - start @@ -190,7 +178,6 @@ async def run(self, drone: "Drone"): self.in_queue_until = time.now self._success = None await sampling_required.put(self) - # print("running job {} in drone {}".format(repr(self), repr(self.drone))) try: start = time.now @@ -211,10 +198,7 @@ async def run(self, drone: "Drone"): # TODO: in_queue_until is still set raise else: - # old_walltime = self.walltime self.walltime = time.now - start - # print(f"monitored 
walltime of {old_walltime} changed to {self.walltime}") - # self.drone = None self._success = True await sampling_required.put(self) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 1de850f..352475c 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -115,8 +115,6 @@ def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: # :param storage: # :return: # """ - # print(pipeinfo) - # print(pipeinfo.requested_throughput) results = [ { "pipe": repr(pipeinfo.pipename), diff --git a/lapis/monitor/duplicates.py b/lapis/monitor/duplicates.py index a599670..25de57a 100644 --- a/lapis/monitor/duplicates.py +++ b/lapis/monitor/duplicates.py @@ -24,7 +24,6 @@ def user_demand_tmp(user_demand: UserDemand) -> List[Dict]: :param scheduler: the scheduler :return: list of records for logging """ - # print("user_demand", job_queue) result = [{"value": user_demand.value}] return result diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index ab5ba9f..c9e1d13 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -77,7 +77,6 @@ def user_demand(job_queue: JobQueue) -> List[Dict]: :param scheduler: the scheduler :return: list of records for logging """ - # print("user_demand", job_queue) result = [{"value": len(job_queue)}] return result diff --git a/lapis/scheduler.py b/lapis/scheduler.py index e926cf3..40e0024 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -97,11 +97,7 @@ def access_wrapped(name, requested=True): caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) - # print(caches) try: - # print(mean( - # [1. / cache.connection._throughput_scale for cache in caches] - # )) return mean( [1.0 / cache.connection._throughput_scale for cache in caches] ) @@ -112,11 +108,7 @@ def access_wrapped(name, requested=True): caches = self._wrapped.connection.storages.get( self._wrapped.sitename, None ) - # print(caches) try: - # print(mean( - # [cache.connection._throughput_scale for cache in caches] - # )) return mean( [cache.connection._throughput_scale for cache in caches] ) @@ -129,11 +121,6 @@ def access_wrapped(name, requested=True): ) try: - # print(sum( - # [cache.connection.throughput / 1000. / 1000. / 1000. for cache - # in - # caches] - # ) / float(self._wrapped.pool_resources["cores"])) return sum( [ cache.connection.throughput / 1000.0 / 1000.0 / 1000.0 @@ -144,7 +131,6 @@ def access_wrapped(name, requested=True): return 0 elif "cached_data" == item: - # print(self._wrapped, self._wrapped.cached_data / 1000. / 1000. / 1000.) 
return self._wrapped.cached_data / 1000.0 / 1000.0 / 1000.0 elif "data_volume" == item: @@ -154,12 +140,9 @@ def access_wrapped(name, requested=True): return time.now - self._wrapped.queue_date elif "failed_matches" == item: - # print("evaluated", self._wrapped, self._wrapped.failed_matches) return self._wrapped.failed_matches elif "jobs_with_cached_data" == item: - # print(self._wrapped) - # print(self._wrapped.jobs_with_cached_data) return self._wrapped.jobs_with_cached_data return super(WrappedClassAd, self).__getitem__(item) @@ -611,15 +594,6 @@ def __init__( self._drones: RankedClusters[Drone] = RankedNonClusters( quantization=quantization_defaults, ranking=parse(pre_job_rank) ) - # if autocluster: - # self._drones: RankedClusters[Drone] = RankedAutoClusters( - # quantization=quantization_defaults, ranking=parse(pre_job_rank) - # ) - # else: - # self._drones: RankedClusters[Drone] = RankedNonClusters( - # quantization=quantization_defaults, ranking=parse(pre_job_rank) - # ) - self.interval = interval self.job_queue = JobQueue() self._collecting = True @@ -669,7 +643,6 @@ def debug_evaluate(expr, my, target=None): if type(expr) is str: expr = my[expr] result = expr.evaluate(my=my, target=target) - # print(f'>>> {expr}, {my}, {target}\n... {result}') return result if job["Requirements"] != Undefined(): @@ -719,8 +692,6 @@ async def _schedule_jobs(self): matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] for queue_index, candidate_job in enumerate(self.job_queue): try: - # print(time.now, candidate_job._wrapped, - # candidate_job._wrapped.requested_inputfiles) pre_job_drones.lookup(candidate_job._wrapped) matched_drone = self._match_job( candidate_job, pre_job_drones.cluster_groups() diff --git a/lapis/storageelement.py b/lapis/storageelement.py index f58c477..bb564c9 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -168,21 +168,11 @@ def find(self, requested_file: RequestedFile, job_repr=None): :param job_repr: Needed for debug output, will be replaced :return: (amount of cached data, storage object) """ - # print( - # "LOOK UP FILE: Job {}, File {}, Storage {} @ {}".format( - # job_repr, requested_file.filename, repr(self), time.now - # ) - # ) try: result = LookUpInformation( self.files[requested_file.filename].filesize, self ) except KeyError: - # print( - # "File {} not cached on any reachable storage".format( - # requested_file.filename - # ) - # ) result = LookUpInformation(0, self) return result @@ -218,16 +208,6 @@ def used(self): return 0 async def transfer(self, file: RequestedFile, job_repr=None): - # print( - # "TRANSFER: {}, filesize {}, remote: {}/{}, cache: {}/{}".format( - # self._hitrate, - # file.filesize, - # (1 - self._hitrate) * file.filesize, - # self.remote_storage.connection.throughput, - # self._hitrate * file.filesize, - # self.connection.throughput, - # ) - # ) async with Scope() as scope: logging.getLogger("implementation").warning( "{} {} @ {} in {}".format( @@ -280,17 +260,6 @@ def used(self): return 0 async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): - # print( - # "TRANSFER: on {} with {}, filesize {}, remote: {}/{}, cache: {}/{}".format( - # self.name, - # file.cachehitrate, - # file.filesize, - # (1 - file.cachehitrate) * file.filesize, - # self.remote_storage.connection.throughput, - # file.cachehitrate * file.filesize, - # self.connection.throughput, - # ) - # ) if file.cachehitrate: await self.connection.transfer(total=file.filesize) await sampling_required.put(self.connection) @@ 
-300,7 +269,6 @@ async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): raise ValueError def find(self, requested_file: RequestedFile_HitrateBased, job_repr=None): - # return LookUpInformation(requested_file.filesize, self) return LookUpInformation( requested_file.filesize * requested_file.cachehitrate, self ) From 0473c9eafe04e6e3ff5329e8b6bbe0f7bbaa4b9f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Sun, 10 May 2020 20:46:53 +0200 Subject: [PATCH 565/648] cleaned up custom_simulate --- custom_simulate.py | 57 ---------------------------------------------- 1 file changed, 57 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index f11327f..6820b6a 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -163,60 +163,3 @@ def ini_and_run( simulator.run(until=until) -if __name__ == "__main__": - - job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal_only_cpu.json" - - pool_files = ["/home/tabea/work/testdata/fullsim/minimal_pool.csv"] - - storage_file = "/home/tabea/work/testdata/fullsim/sg_caches_shared.csv" - storage_type = "filehitrate" - ini_and_run( - job_file=job_file, - remote_throughput=0.75, - calculation_efficiency=0.9, - pool_files=pool_files, - storage_file=storage_file, - storage_type=storage_type, - log_file="test_new_scheduler.log", - log_telegraf=True, - seed=1234, - # pre_job_rank="10000000 * my.Rank + 1000000 - 100000 * my.cpus - my.memory", - pre_job_rank="0", - machine_ads=""" - requirements = target.requestcpus <= my.cpus - rank = 0 - """.strip(), - job_ads=""" - Requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory Rank = target.cached_data """, - ) - - -# job_file = "/home/tabea/work/testdata/hitratebased/job_list_minimal.json" - -# job_file = "/home/tabea/work/testdata/modified/job_list_minimal_only_caching.json" -# job_file = "/home/tabea/work/testdata/modified/single_job.json" -# job_file = "/home/tabea/work/testdata/modified/week_25_1.0_0.0_16_input.json" -# job_file = "/home/tabea/work/testdata/fullsim/test_12h_jobinput.json" -# job_file = "/home/tabea/work/testdata/fullsim/resampled_reduced_025week_16_jobinput" \ -# ".json" -# pool_files = ["/home/tabea/work/testdata/hitratebased/sg_machines.csv", -# "/home/tabea/work/testdata/hitratebased/dummycluster.csv"] -# storage_file = "/home/tabea/work/testdata/hitratebased/sg_caches.csv" -# storage_type = "filehitrate" -# -# ini_and_run(job_file=job_file, pool_files=pool_files, storage_file=storage_file, -# storage_type=storage_type, log_file="minimal_hitratebased_test.log", -# log_telegraf=True) - -# job_file = "/home/tabea/work/testdata/hitratebased/testjobs.json" -# job_file = "/home/tabea/work/testdata/hitratebased/week.json" -# job_file = "/home/tabea/work/testdata/hitratebased/day_jobinput.json" -# job_file = "/home/tabea/work/testdata/hitratebased/week_1_sample_time_jobinput.json" -# pool_files = [ -# "/home/tabea/work/testdata/modified/sg_machines_shared_cache.csv", -# "/home/tabea/work/testdata/modified/dummycluster_888cores_split.csv", -# ] -# pool_files = ["/home/tabea/work/testdata/fullsim/dummycluster.csv"] -# pool_files = ["/home/tabea/work/testdata/fullsim/sg_machines_shared_cache.csv"] -# storage_file = "/home/tabea/work/testdata/fullsim/minimal_cache.csv" From 7ce1f048f0206b0d54de2de0dc0a7d97d24b403f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 17 Sep 2020 14:47:03 +0200 Subject: [PATCH 566/648] fixed TestSimulationTimeFilter test_explicit() test by using pytest.approx --- lapis_tests/utility/test_monitor.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index 533e9eb..af009e8 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -54,7 +54,7 @@ def record(): filter = SimulationTimeFilter() async with Scope() as _: filter.filter(record) - assert record.created == 0 + assert record.created == pytest.approx(0, abs=1e-9) def dummy_statistics(): From 0a3ed6f3f396186e2357755781d9d333c3f3273e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 18 Sep 2020 14:01:22 +0200 Subject: [PATCH 567/648] ensured pyparse > 2.4.1 (workaround for https://github.com/MaineKuehn/classad/pull/24/) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bb3662b..888111e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ 'Programming Language :: Python :: 3.7' ] -requires = ["cobald", "usim >= 0.4.3", "click", "classad"] +requires = ["cobald", "usim >= 0.4.3", "click", "classad", "pyparsing > 2.4.1"] [tool.flit.metadata.requires-extra] test = [ From e77b462765005d157de1911210c1379537c201bf Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 21 Sep 2020 09:24:17 +0200 Subject: [PATCH 568/648] adapted conversion to GB in unit tests to match conversion defined in storage_io/htcondor.py --- lapis_tests/storage_io/test_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 68adb46..559014f 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -51,8 +51,8 @@ def test_simple_read(self): ): assert storage is not None assert type(storage.available) == int - assert storage.available == int(5.0 * 1024 * 1024 * 1024) + assert storage.available == int(5.0 * 1000 * 1000 * 1000) assert type(storage.size) == int - assert storage.size == int(10.0 * 1024 * 1024 * 1024) + assert storage.size == int(10.0 * 1000 * 1000 * 1000) count += 1 assert count == 1 From 6b2a0c30c146a5e018abc6f468785132c5b6fc34 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 21 Sep 2020 09:35:50 +0200 Subject: [PATCH 569/648] only start pipe monitoring if connections and therefore pipes are actually present --- lapis/simulator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 20a3142..db9c51f 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -115,7 +115,8 @@ async def _simulate(self, end): for controller in self.controllers: while_running.do(controller.run(), volatile=True) while_running.do(self.monitoring.run(), volatile=True) - while_running.do(self.connection.run_pipemonitoring(), volatile=True) + if self.connection: + while_running.do(self.connection.run_pipemonitoring(), volatile=True) self.duration = time.now print( f"[lapis-{monitor.SIMULATION_START}] Finished simulation at {self.duration}" From db1410620ddef1ec383b1700f0d83f2dff75d22c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 21 Sep 2020 10:02:46 +0200 Subject: [PATCH 570/648] added mock Job object to fix unit test that failed because `reads_from_cache` attribute was not set, added `hitrates` information to test cases --- lapis_tests/__init__.py | 6 ++++++ lapis_tests/test_caching_hitrate_based.py | 13 ++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py 
index 562d4c0..7a4b971 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -79,3 +79,9 @@ def __init__(self, throughput: Optional[float] = None): self.connection = Connection(throughput) else: self.connection = None + + +class DummyJob: + + def __init__(self, reads_from_cache=False): + self.reads_from_cache = reads_from_cache diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 7fd7fc0..912bee8 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -3,7 +3,7 @@ import json from functools import partial -from lapis_tests import via_usim, DummyDrone +from lapis_tests import via_usim, DummyDrone, DummyJob from lapis.connection import Connection from lapis.storageelement import HitrateStorage from lapis.storage_io.storage import storage_reader @@ -66,11 +66,12 @@ async def test_single_transfer_files(self): throughput = 10 size = 1000 drone = DummyDrone(throughput) - requested_files = dict(test=dict(usedsize=100)) + job = DummyJob(True) + requested_files = dict(test=dict(usedsize=100, hitrates={drone.sitename: 1.0})) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) drone.connection.add_storage_element(hitratestorage) stream_time = await drone.connection.transfer_files( - drone=drone, requested_files=requested_files, job_repr="test" + drone=drone, requested_files=requested_files, job_repr=job ) assert time.now == 5 @@ -81,11 +82,13 @@ async def test_simultaneous_transfer(self): throughput = 10 size = 1000 drone = DummyDrone(throughput) - requested_files = dict(test1=dict(usedsize=100), test2=dict(usedsize=200)) + job = DummyJob(True) + requested_files = dict(test1=dict(usedsize=100, hitrates={drone.sitename: 1.0}), + test2=dict(usedsize=200, hitrates={drone.sitename: 1.0})) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) drone.connection.add_storage_element(hitratestorage) stream_time = await drone.connection.transfer_files( - drone=drone, requested_files=requested_files + drone=drone, requested_files=requested_files, job_repr=job ) assert time.now == 15 assert stream_time == 15 From 3015991ea4edc3dcc59ad5381eccf1d43fdc9ef6 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 22 Sep 2020 13:20:09 +0200 Subject: [PATCH 571/648] Added draft for job class documentation --- lapis/job.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 97 insertions(+), 9 deletions(-) diff --git a/lapis/job.py b/lapis/job.py index a66884d..0bbcc9c 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -11,6 +11,56 @@ class Job(object): + """ + Objects of this class represent jobs. The job is described from the batch + system's viewpoint by the following attributes: + :param resources: information about the resources requested by the job + :param used_resources: information about the resources used by the job + :param walltime: the job's runtime, in reality as well as in the simulation + :param requested_walltime: walltime requested by the job + :param cputime: the cumulated amount of time the used CPU(s) was (were) active + during the job's execution + :param queue_date: time when the job was submitted to the simulated job queue + _success: represents whether the job was run successfully + + In addition, functionality allowing the simulation of data transfers is provided. In + this case, the job attributes have to be extended by information about the job's + input data. 
The job's runtime is then recalculated if the job processes + input data and is executed on resources with access to caches. Data + transfer and processing are assumed to be done in parallel. This is a valid + assumption if the input data are divided into blocks, transferred + throughout the job's runtime and if already transferred data blocks are processed + while other blocks are fetched. If the job's overall runtime is long and if the + data set was transferred in a large number of blocks, the job's runtime (walltime) + can be recalculated using max(calculation time, transfer time). + + :param requested_inputfiles: information about the input files requested by a job + :param used_inputfiles: information about the input files actually read by a job + :param _total_input_data: data volume of used_inputfiles, amount of data + processed by the job + :param _original_walltime: the job's walltime as stated in the simulation's + input. Is stored for monitoring purposes as the job's walltime can be + altered + :param _calculation_time: the calculation time represents the time needed to + process the job's input data + :param calculation_efficiency: represents the efficiency of calculations + performed on the job's input data. Default = 1.0. Can be modified to take + programmatic inefficiencies into account. + :param _transfer_time: the transfer time represents the time needed to + transfer the job's input data. + + As the simulation of data transfers is used to simulate and study caching, + the following metadata are introduced and used for monitoring purposes. + :param _read_from_cache: true if job read data from cache + :param _cached_data: the amount of input data that is currently cached + :param failed_matches: number of times a match of this job to a resource was + rejected (see scheduler for details) + :param cache_probability: the averaged probability that all data are cached + (sum(filesize * hitrate = probability that file is cached) / sum(filesize)) + :param expectation_cached_data: the expectation value for the amount of + cached data (sum(filesize * hitrate)) + """ + __slots__ = ( "resources", "used_resources", @@ -46,21 +96,21 @@ def __init__( queue_date: float = 0, name: Optional[str] = None, drone: "Optional[Drone]" = None, - calculation_efficiency: Optional[float] = None, + calculation_efficiency: Optional[float] = 1.0, ): """ - Definition of a job that uses a specified amount of resources `used_resources` - over a given amount of time, `walltime`. A job is described by its user - via the parameter `resources`. This is a user prediction and is expected - to deviate from `used_resources`.
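# The two cache-related metrics described in the class docstring above can be
# read as the following standalone sketch (hypothetical helper, not part of the
# patch); `used_inputfiles` maps file names to dicts with "usedsize" and
# per-site "hitrates", as in the unit tests of this patch series:
def cache_metrics(used_inputfiles: dict) -> tuple:
    total_size = sum(f["usedsize"] for f in used_inputfiles.values())
    # expectation value for the amount of cached data: sum(filesize * hitrate)
    expectation_cached_data = sum(
        f["usedsize"] * sum(f["hitrates"].values())
        for f in used_inputfiles.values()
    )
    # averaged probability that the input data are cached:
    # sum(filesize * hitrate) / sum(filesize)
    cache_probability = expectation_cached_data / total_size if total_size else 0
    return expectation_cached_data, cache_probability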
+ Initialization of a job - :param resources: Requested resources of the job - :param used_resources: Resource usage of the job + :param resources: Requested resources of the job, including walltime and + input data + :param used_resources: Resource usage of the job, including walltime and + input data :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler :param queue_date: Time when job was inserted into queue in real life :param name: Name of the job :param drone: Drone where the job is running on + :param calculation_efficiency: """ self.resources = resources self.used_resources = used_resources @@ -91,13 +141,23 @@ def __init__( self._original_walltime = self.walltime self._calculation_time = 0 self._transfer_time = 0 - self.cputime = self.used_resources["cores"] * self.walltime + + # TODO: this try-except is a fix for a unit test, check whether this makes + # sense in all use cases + try: + self.cputime = self.used_resources["cores"] * self.walltime + except KeyError: + self.cputime = None + try: self._total_input_data = sum( [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] ) except AttributeError: self._total_input_data = 0 + + # TODO: see unit test test_read_with_inputfiles -> decide whether making + # information about hitrates obligatory is actually necessary if self._total_input_data: self.expectation_cached_data = sum( [ @@ -107,6 +167,7 @@ def __init__( ) else: self.expectation_cached_data = 0 + if self._total_input_data: self.cache_probability = sum( [ @@ -116,6 +177,7 @@ def __init__( ) / sum([file["usedsize"] for file in self.used_inputfiles.values()]) else: self.cache_probability = 0 + self.failed_matches = 0 @property @@ -129,7 +191,7 @@ def successful(self) -> Optional[bool]: @property def waiting_time(self) -> float: """ - The time the job spent in the simulators scheduling queue. `Inf` when + Determines the time the job spent in the simulated scheduling queue. `Inf` when the job is still waiting. :return: Time in queue @@ -142,6 +204,13 @@ async def _calculate(self): """ Determines a jobs calculation time based on the jobs CPU time and a calculation efficiency representing inefficient programming. + + If a job contains input files and the drone the job runs on has a defined remote + connection (throughput < Inf), the calculation time is given by the job's CPU time + divided by a configurable `calculation_efficiency` that can be set != 1, e.g. to + account for programmatic inefficiencies. + + Else, the calculation time remains equal to the job's original `walltime`. :param calculation_efficiency: :return: """ @@ -170,9 +239,20 @@ async def _transfer_inputfiles(self): ) except AttributeError: pass + print("end transfer files ", time.now) self._transfer_time = time.now - start async def run(self, drone: "Drone"): + """ + Handles the job's execution. + The job's runtime is given by max(calculation time, transfer time). + The calculation time is determined by `_calculate`, the transfer time by + `_transfer_inputfiles`. + The job will be executed successfully unless the selected drone does not + provide enough resources, is unavailable or an exception occurs.
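# Hedged sketch of the runtime model stated above (illustrative names only):
# input-file transfer and calculation overlap, so the monitored walltime is the
# later of the two; per the `_calculate` docstring the calculation time is the
# CPU time scaled by the configurable calculation efficiency.
def effective_walltime(cputime: float, calculation_efficiency: float,
                       transfer_time: float) -> float:
    calculation_time = cputime / calculation_efficiency
    return max(calculation_time, transfer_time)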
:param drone: the drone object the job was allocated to and is executed on + :return: + """ assert drone, "Jobs cannot run without a drone being assigned" self.drone = drone self.in_queue_until = time.now @@ -181,6 +261,7 @@ async def run(self, drone: "Drone"): try: start = time.now + print(start) async with Scope() as scope: await instant scope.do(self._transfer_inputfiles()) @@ -207,6 +288,13 @@ def __repr__(self): async def job_to_queue_scheduler(job_generator, job_queue): + """ + Handles reading the simulation's job input and puts the jobs into the job queue + :param job_generator: reader object that yields jobs from input + :param job_queue: queue the jobs are added to + :return: + + """ base_date = None for job in job_generator: if base_date is None: From adcf575e7f3af9415a06bedbf57147d265495e7c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 22 Sep 2020 13:22:35 +0200 Subject: [PATCH 572/648] Added draft for storage classes documentation --- lapis/interfaces/_storage.py | 4 ++ lapis/storageelement.py | 131 +++++++++++++++++++++++++++++------ 2 files changed, 114 insertions(+), 21 deletions(-) diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index 78f7dfc..7ff3905 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -11,6 +11,10 @@ class LookUpInformation(NamedTuple): class Storage(metaclass=abc.ABCMeta): + """ + This class represents the basic structures of all representations of storage + in this simulation. + """ @property @abc.abstractmethod def size(self) -> int: diff --git a/lapis/storageelement.py b/lapis/storageelement.py index bb564c9..6d8d23f 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -11,7 +11,21 @@ class RemoteStorage(Storage): + """ + The RemoteStorage object represents the entirety of (WLCG) grid storage. All + files that can be requested by a job are provided by remote storage and its size + is therefore approximated as infinite. Files are transferred from this storage + via the associated pipe, a network bandwidth model. There can be multiple remote + storages in the simulation because resource pools may have differing network + connections. + """ + # TODO:: ensure that there can be multiple remote storages in the simulation def __init__(self, pipe: MonitoredPipe): + """ + Initialization of the remote storage's pipe, representing the network + connection to remote storage with a limited bandwidth. + :param pipe: + """ self.connection = pipe pipe.storage = repr(self) @@ -28,20 +42,41 @@ def used(self): return 0 async def transfer(self, file: RequestedFile, **kwargs): + """ + Simulates the transfer of a requested file via the remote storage's pipe. + :param file: representation of the requested file + """ await self.connection.transfer(total=file.filesize) await sampling_required.put(self.connection) async def add(self, file: StoredFile, **kwargs): + """ + All files are contained in remote storage. Therefore no functionality for + adding files is provided. + """ raise NotImplementedError async def remove(self, file: StoredFile, **kwargs): + """ + All files are contained in remote storage. Therefore no functionality + for removing files is provided. + """ raise NotImplementedError def find(self, file: RequestedFile, **kwargs) -> LookUpInformation: + """ + All files are contained in remote storage. Therefore no functionality + to determine whether the storage contains a certain file is provided.
+ """ raise NotImplementedError class StorageElement(Storage): + """ + The StorageElement object represents a local data storage or cache containing an + exact list of files and providing functionality to transfer and change the + storage's content. + """ __slots__ = ( "name", @@ -63,11 +98,27 @@ def __init__( size: int = 1000 * 1000 * 1000 * 1000, throughput_limit: int = 10 * 1000 * 1000 * 1000, files: Optional[dict] = None, + deletion_duration: float = 5, + update_duration: float = 1 ): + """ + Intialization of a storage element object. + + :param name: identification of the storage + :param sitename: + :param size: total size of the storage in bytes + :param throughput_limit: maximal bandwidth of the network connection to this + storage + :param files: dictionary of the files that are currently stored + :param deletion_duration: in seconds, amount of time passing while a file is + deleted from the storage + :param update_duration: in seconds, amount of time passing while a file's + information is updated + """ self.name = name self.sitename = sitename - self.deletion_duration = 5 - self.update_duration = 1 + self.deletion_duration = deletion_duration + self.update_duration = update_duration self._size = size self.files = files self._usedstorage = Resources( @@ -94,35 +145,26 @@ async def remove(self, file: StoredFile, job_repr=None): """ Deletes file from storage object. The time this operation takes is defined by the storages deletion_duration attribute. - :param file: + :param file: representation of the file that is removed from the storage :param job_repr: Needed for debug output, will be replaced :return: """ - print( - "REMOVE FROM STORAGE: Job {}, File {} @ {}".format( - job_repr, file.filename, time.now - ) - ) await (time + self.deletion_duration) await self._usedstorage.decrease(size=file.filesize) self.files.pop(file.filename) async def add(self, file: RequestedFile, job_repr=None): """ - Adds file to storage object transfering it through the storage objects + Adds file to storage object transferring it through the storage object's connection. This should be sufficient for now because files are only added - to the storage when they are also transfered through the Connections remote + to the storage when they are also transferred through the Connections remote connection. If this simulator is extended to include any kind of direct file placement this has to be adapted. - :param file: + :param file: representation of the file that is added to the storage :param job_repr: Needed for debug output, will be replaced :return: """ - print( - "ADD TO STORAGE: Job {}, File {} @ {}".format( - job_repr, file.filename, time.now - ) - ) + file = file.convert_to_stored_file_object(time.now) await self._usedstorage.increase(size=file.filesize) self.files[file.filename] = file @@ -138,11 +180,6 @@ async def _update(self, stored_file: StoredFile, job_repr): await (time + self.update_duration) stored_file.lastaccessed = time.now stored_file.increment_accesses() - print( - "UPDATE: Job {}, File {} @ {}".format( - job_repr, stored_file.filename, time.now - ) - ) async def transfer(self, file: RequestedFile, job_repr=None): """ @@ -181,6 +218,17 @@ def __repr__(self): class HitrateStorage(StorageElement): + """ + This class was used in early simulation concepts but is outdated now! + You're probably looking for FileBasedHitrateStorage instead! + + Simplified storage object, used to simulate a simplified form of hitrate based + caching. No explicit list of stored files is kept. 
Instead, it is assumed that a + fraction `_hitrate` of all files is stored. Every time a file is requested from + this kind of storage, `_hitrate` percent of the file are found on and transferred from this storage. + 1 - `_hitrate` percent of the file are transferred from the remote storage + associated to the hitrate storage. + """ def __init__( self, hitrate, @@ -208,6 +256,15 @@ def used(self): return 0 async def transfer(self, file: RequestedFile, job_repr=None): + """ + Every time a file is requested from this kind of storage, `_hitrate` percent + of the file are found on and transferred from this storage. + 1 - `_hitrate` percent of the file are transferred from the remote storage + associated to the hitrate storage. + :param file: + :param job_repr: + :return: + """ async with Scope() as scope: logging.getLogger("implementation").warning( "{} {} @ {} in {}".format( @@ -228,13 +285,30 @@ def find(self, requested_file: RequestedFile, job_repr=None): return LookUpInformation(requested_file.filesize, self) async def add(self, file: RequestedFile, job_repr=None): + """ + As files are not contained explicitly, no functionality to add files is + needed + """ pass async def remove(self, file: StoredFile, job_repr=None): + """ + As files are not contained explicitly, no functionality to remove files is + needed + """ pass class FileBasedHitrateStorage(StorageElement): + """ + Simplified storage object. There is no explicit list of contained files. + Instead, it is stated in file information (`RequestedFile_HitrateBased`) + whether this file is currently stored. Whether this is the case was determined in + the connection module's file transfer functionality. + The definition of the storage objects size is currently irrelevant. + + #TODO: this storage object has become very intermingled with the connection module and should be tidied up and restructured! + """ def __init__( self, name: Optional[str] = None, @@ -269,12 +343,27 @@ async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): raise ValueError def find(self, requested_file: RequestedFile_HitrateBased, job_repr=None): + """ + Returns the expectation value for the amount of data of this file that are + cached. + :param requested_file: + :param job_repr: + :return: + """ return LookUpInformation( requested_file.filesize * requested_file.cachehitrate, self ) async def add(self, file: RequestedFile, job_repr=None): + """ + As there is no explicit record of stored files, no functionality to add files is + needed + """ pass async def remove(self, file: StoredFile, job_repr=None): + """ + As there is no explicit record of stored files, no functionality to + remove files is needed + """ pass From 3b161206793a3c2c527d8a43dcd602ca89d85c2d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 23 Sep 2020 09:21:46 +0200 Subject: [PATCH 573/648] Added draft for connection classes documentation --- lapis/connection.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index e594400..474d387 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -17,6 +17,20 @@ class Connection(object): + """ + Class that manages and triggers file transfers. It contains a mapping of + sitenames to storages in the `storages` dictionary and a global remote storage. 
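# Sketch of the hitrate-based decision this class makes in transfer_files (cf.
# the connection.py hunks earlier in this series); `requested_files` uses the
# same per-file mapping as in the unit tests and `sitename` selects the
# relevant hitrate entry (helper name is illustrative, not part of the patch):
import random

def decide_cache_hit(requested_files: dict, sitename: str) -> int:
    total_size = sum(f["usedsize"] for f in requested_files.values())
    try:
        hitrate = sum(
            f["usedsize"] * f["hitrates"].get(sitename, 0.0)
            for f in requested_files.values()
        ) / total_size
    except ZeroDivisionError:
        hitrate = 0
    # the job as a whole either reads from the cache or entirely from remote
    return int(random.random() < hitrate)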
+ It can be used in file based and hitrate based caching mode, however the current + version is designed for hitrate based caching and the file based caching + functionality should be tested thoroughly before being activated. + + TODO:: this concept should be abolished, remote storages should be created based + on configs as normal storages. There should be an additional site class that + manages the mapping of storages and drones and the connection class should be + limited to managing and directing file transfers to the correct site, if this is + even necessary. Furthermore, the mechanics for choosing between caching scenarios + should be redesigned. + """ __slots__ = ( "storages", @@ -26,6 +40,11 @@ class Connection(object): ) def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): + """ + Intialization of the connection object + :param throughput: throughput of the connection's remote storage + :param filebased_caching: + """ self.storages = dict() self.remote_connection = RemoteStorage(MonitoredPipe(throughput=throughput)) self.caching_algorithm = CacheAlgorithm( @@ -38,9 +57,12 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): self._filebased_caching = filebased_caching async def run_pipemonitoring(self): + """ + Starts monitoring of pipe objects, should be called during simulator/monitoring + initialization. + """ async def report_load_to_monitoring(pipe: MonitoredPipe): async for information in pipe.load(): - # print(information) await sampling_required.put(information) async with Scope() as scope: @@ -51,7 +73,7 @@ async def report_load_to_monitoring(pipe: MonitoredPipe): def add_storage_element(self, storage_element: StorageElement): """ - Register storage element in Connetion module clustering storage elements by + Register storage element in Connetion module, clustering storage elements by sitename :param storage_element: :return: @@ -136,7 +158,7 @@ async def stream_file( async def transfer_files(self, drone, requested_files: dict, job_repr): """ Converts dict information about requested files to RequestedFile object and - sequentially streams all files + sequentially streams all files. If there is information about input files but no informations about the file size, th :param drone: :param requested_files: :param job_repr: @@ -160,6 +182,9 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): except ZeroDivisionError: hitrate = 0 provides_file = 0 + #TODO:: In which cases is hitrate not defined and how can they be covered? I + # think that in this case this code should not be reached but I'm unsure + # right now await sampling_required.put( HitrateInfo( From 193def94050c3f70bdfe5843bb000973a01b8cec Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 29 Sep 2020 17:53:52 +0200 Subject: [PATCH 574/648] Moved scheduler without ClassAd compatibility into separate file to enhance readability. 
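A small worked example of the cost ranking implemented by the relocated scheduler (numbers are illustrative only): a job requesting 2 cores and 4 units of memory offered a drone with 8 free cores and 16 free units of memory, and no further pool resources, is ranked with cost = (1 / (8 // 2) + 1 / (16 // 4)) / 4 = 0.125; any cost <= 1 starts the job on that drone immediately, while a drone that cannot fit the job in at least one requested resource is ranked with infinite cost.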
--- lapis/scheduler_withoutClassAds.py | 164 +++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 lapis/scheduler_withoutClassAds.py diff --git a/lapis/scheduler_withoutClassAds.py b/lapis/scheduler_withoutClassAds.py new file mode 100644 index 0000000..83ceba7 --- /dev/null +++ b/lapis/scheduler_withoutClassAds.py @@ -0,0 +1,164 @@ +from typing import Dict + +from usim import Scope, interval, Resources + +from lapis.drone import Drone +from lapis.monitor import sampling_required +from lapis.monitor.duplicates import UserDemand + +from lapis.scheduler import JobScheduler + + +class CondorJobScheduler(JobScheduler): + """ + Goal of the htcondor job scheduler is to have a scheduler that somehow + mimics how htcondor does schedule jobs. + Htcondor does scheduling based on a priority queue. The priorities itself + are managed by operators of htcondor. + So different instances can apparently behave very different. + In my case I am going to try building a priority queue that sorts job slots + by increasing cost. The cost itself is calculated based on the current + strategy that is used at GridKa. The scheduler checks if a job either + exactly fits a slot or if it does fit into it several times. The cost for + putting a job at a given slot is given by the amount of resources that + might remain unallocated. + :return: + """ + + def __init__(self, job_queue): + self._stream_queue = job_queue + self.drone_cluster = [] + self.interval = 60 + self.job_queue = JobQueue() + self._collecting = True + self._processing = Resources(jobs=0) + + @property + def drone_list(self): + for cluster in self.drone_cluster: + for drone in cluster: + yield drone + + def register_drone(self, drone: Drone): + self._add_drone(drone) + + def unregister_drone(self, drone: Drone): + for cluster in self.drone_cluster: + try: + cluster.remove(drone) + except ValueError: + pass + else: + if len(cluster) == 0: + self.drone_cluster.remove(cluster) + + def _add_drone(self, drone: Drone, drone_resources: Dict = None): + minimum_distance_cluster = None + distance = float("Inf") + if len(self.drone_cluster) > 0: + for cluster in self.drone_cluster: + current_distance = 0 + for key in {*cluster[0].pool_resources, *drone.pool_resources}: + if drone_resources: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone_resources.get(key, 0) + ) + else: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0) + ) + if current_distance < distance: + minimum_distance_cluster = cluster + distance = current_distance + if distance < 1: + minimum_distance_cluster.append(drone) + else: + self.drone_cluster.append([drone]) + else: + self.drone_cluster.append([drone]) + + def update_drone(self, drone: Drone): + self.unregister_drone(drone) + self._add_drone(drone) + + async def run(self): + async with Scope() as scope: + scope.do(self._collect_jobs()) + async for _ in interval(self.interval): + for job in self.job_queue.copy(): + best_match = self._schedule_job(job) + if best_match: + await best_match.schedule_job(job) + self.job_queue.remove(job) + await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) + self.unregister_drone(best_match) + left_resources = best_match.theoretical_available_resources + left_resources = { + key: value - job.resources.get(key, 0) + for key, value in left_resources.items() + } + self._add_drone(best_match, left_resources) 
+ if ( + not self._collecting + and not self.job_queue + and self._processing.levels.jobs == 0 + ): + break + await sampling_required.put(self) + + async def _collect_jobs(self): + async for job in self._stream_queue: + self.job_queue.append(job) + await self._processing.increase(jobs=1) + # TODO: logging happens with each job + await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) + self._collecting = False + + async def job_finished(self, job): + if job.successful: + await self._processing.decrease(jobs=1) + else: + await self._stream_queue.put(job) + + def _schedule_job(self, job) -> Drone: + priorities = {} + for cluster in self.drone_cluster: + drone = cluster[0] + cost = 0 + resources = drone.theoretical_available_resources + for resource_type in job.resources: + if resources.get(resource_type, 0) < job.resources[resource_type]: + # Inf for all job resources that a drone does not support + # and all resources that are too small to even be considered + cost = float("Inf") + break + else: + try: + cost += 1 / ( + resources[resource_type] // job.resources[resource_type] + ) + except KeyError: + pass + for additional_resource_type in [ + key for key in drone.pool_resources if key not in job.resources + ]: + cost += resources[additional_resource_type] + cost /= len((*job.resources, *drone.pool_resources)) + if cost <= 1: + # directly start job + return drone + try: + priorities[cost].append(drone) + except KeyError: + priorities[cost] = [drone] + try: + minimal_key = min(priorities) + if minimal_key < float("Inf"): + return priorities[minimal_key][0] + except ValueError: + pass + return None From 974c77770600cedb642c5437bab8019957ef8f24 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Tue, 29 Sep 2020 17:55:15 +0200 Subject: [PATCH 575/648] Added draft for scheduler class documentation --- lapis/scheduler.py | 330 ++++++++++++++++++++++++--------------------- 1 file changed, 175 insertions(+), 155 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 40e0024..e142abb 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -25,13 +25,13 @@ class JobQueue(list): pass - quantization_defaults = { "memory": HTCInt(128 * 1024 * 1024), "disk": HTCInt(1024 * 1024), "cores": HTCInt(1), } +# ClassAd attributes are not case sensitive machine_ad_defaults = """ requirements = target.requestcpus <= my.cpus """.strip() @@ -45,16 +45,29 @@ class JobQueue(list): class WrappedClassAd(ClassAd, Generic[DJ]): + """ + Combines the original job/drone object and the associated ClassAd. + """ __slots__ = "_wrapped", "_temp" def __init__(self, classad: ClassAd, wrapped: DJ): + """ + Initialization for wrapped ClassAd + :param classad: the wrapped objects ClassAd description + :param wrapped: wrapped object, either job or drone + """ super(WrappedClassAd, self).__init__() self._wrapped = wrapped self._data = classad._data self._temp = {} def empty(self): + """ + Only relevant for wrapped drones to determine whether there are jobs running + on them. If this is the case the amount of cores in usage is >= 1. + :return: true if no CPU cores are in use, false if this is not the case + """ try: return self._temp["cores"] < 1 except KeyError: @@ -201,159 +214,6 @@ async def job_finished(self, job): raise NotImplementedError -class CondorJobScheduler(JobScheduler): - """ - Goal of the htcondor job scheduler is to have a scheduler that somehow - mimics how htcondor does schedule jobs. - Htcondor does scheduling based on a priority queue. 
The priorities itself - are managed by operators of htcondor. - So different instances can apparently behave very different. - In my case I am going to try building a priority queue that sorts job slots - by increasing cost. The cost itself is calculated based on the current - strategy that is used at GridKa. The scheduler checks if a job either - exactly fits a slot or if it does fit into it several times. The cost for - putting a job at a given slot is given by the amount of resources that - might remain unallocated. - :return: - """ - - def __init__(self, job_queue): - self._stream_queue = job_queue - self.drone_cluster = [] - self.interval = 60 - self.job_queue = JobQueue() - self._collecting = True - self._processing = Resources(jobs=0) - - @property - def drone_list(self): - for cluster in self.drone_cluster: - for drone in cluster: - yield drone - - def register_drone(self, drone: Drone): - self._add_drone(drone) - - def unregister_drone(self, drone: Drone): - for cluster in self.drone_cluster: - try: - cluster.remove(drone) - except ValueError: - pass - else: - if len(cluster) == 0: - self.drone_cluster.remove(cluster) - - def _add_drone(self, drone: Drone, drone_resources: Dict = None): - minimum_distance_cluster = None - distance = float("Inf") - if len(self.drone_cluster) > 0: - for cluster in self.drone_cluster: - current_distance = 0 - for key in {*cluster[0].pool_resources, *drone.pool_resources}: - if drone_resources: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone_resources.get(key, 0) - ) - else: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0) - ) - if current_distance < distance: - minimum_distance_cluster = cluster - distance = current_distance - if distance < 1: - minimum_distance_cluster.append(drone) - else: - self.drone_cluster.append([drone]) - else: - self.drone_cluster.append([drone]) - - def update_drone(self, drone: Drone): - self.unregister_drone(drone) - self._add_drone(drone) - - async def run(self): - async with Scope() as scope: - scope.do(self._collect_jobs()) - async for _ in interval(self.interval): - for job in self.job_queue.copy(): - best_match = self._schedule_job(job) - if best_match: - await best_match.schedule_job(job) - self.job_queue.remove(job) - await sampling_required.put(self.job_queue) - await sampling_required.put(UserDemand(len(self.job_queue))) - self.unregister_drone(best_match) - left_resources = best_match.theoretical_available_resources - left_resources = { - key: value - job.resources.get(key, 0) - for key, value in left_resources.items() - } - self._add_drone(best_match, left_resources) - if ( - not self._collecting - and not self.job_queue - and self._processing.levels.jobs == 0 - ): - break - await sampling_required.put(self) - - async def _collect_jobs(self): - async for job in self._stream_queue: - self.job_queue.append(job) - await self._processing.increase(jobs=1) - # TODO: logging happens with each job - await sampling_required.put(self.job_queue) - await sampling_required.put(UserDemand(len(self.job_queue))) - self._collecting = False - - async def job_finished(self, job): - if job.successful: - await self._processing.decrease(jobs=1) - else: - await self._stream_queue.put(job) - - def _schedule_job(self, job) -> Drone: - priorities = {} - for cluster in self.drone_cluster: - drone = cluster[0] - cost = 0 - resources = drone.theoretical_available_resources - for resource_type in 
job.resources: - if resources.get(resource_type, 0) < job.resources[resource_type]: - # Inf for all job resources that a drone does not support - # and all resources that are too small to even be considered - cost = float("Inf") - break - else: - try: - cost += 1 / ( - resources[resource_type] // job.resources[resource_type] - ) - except KeyError: - pass - for additional_resource_type in [ - key for key in drone.pool_resources if key not in job.resources - ]: - cost += resources[additional_resource_type] - cost /= len((*job.resources, *drone.pool_resources)) - if cost <= 1: - # directly start job - return drone - try: - priorities[cost].append(drone) - except KeyError: - priorities[cost] = [drone] - try: - minimal_key = min(priorities) - if minimal_key < float("Inf"): - return priorities[minimal_key][0] - except ValueError: - pass - return None # HTCondor ClassAd Scheduler @@ -426,12 +286,21 @@ class RankedAutoClusters(RankedClusters[DJ]): """Automatically cluster similar jobs or drones""" def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): + """ + :param quantization: + :param ranking: prejobrank expression + """ self._quantization = quantization self._ranking = ranking self._clusters: Dict[RankedClusterKey, Set[WrappedClassAd[DJ]]] = SortedDict() self._inverse: Dict[WrappedClassAd[DJ], RankedClusterKey] = {} def empty(self) -> bool: + """ + Checks whether all drones in the RankedCluster are empty and currently not + running any jobs. + :return: + """ for drones in self._clusters.values(): if not next(iter(drones)).empty(): return False @@ -446,6 +315,15 @@ def copy(self) -> "RankedAutoClusters[DJ]": return clone def add(self, item: WrappedClassAd[DJ]): + """ + Add a new wrapped item, usually a drone, to the RankedAutoCluster. + Unless the item is already contained, the item's key is generated and it is + sorted in into the clusters accordingly. If there are already items with the + same key, the new item is added to the existing cluster. If not, + a new cluster is created. + :param item: + :return: + """ if item in self._inverse: raise ValueError(f"{item!r} already stored; use `.update(item)` instead") item_key = self._clustering_key(item) @@ -456,6 +334,11 @@ def add(self, item: WrappedClassAd[DJ]): self._inverse[item] = item_key def remove(self, item: WrappedClassAd[DJ]): + """ + Removes the item. + :param item: + :return: + """ item_key = self._inverse.pop(item) cluster = self._clusters[item_key] cluster.remove(item) @@ -463,6 +346,15 @@ def remove(self, item: WrappedClassAd[DJ]): del self._clusters[item_key] def _clustering_key(self, item: WrappedClassAd[DJ]): + """ + Calculates an item's clustering key based on the specified ranking (in my use + case the prejobrank) and the item's available resource. The resulting key's + structure is (prejobrank value, (available cpus, available memory, available + disk space)) + :param item: drone for which the clustering key is calculated. 
+ :return: (prejobrank value, (available cpus, available memory, available + disk space)) + """ # TODO: assert that order is consistent quantization = self._quantization return RankedClusterKey( @@ -474,12 +366,23 @@ def _clustering_key(self, item: WrappedClassAd[DJ]): ) def clusters(self) -> Iterator[Set[WrappedClassAd[DJ]]]: + """ + :return: iterator of all clusters + """ return iter(self._clusters.values()) def items(self) -> Iterator[Tuple[RankedClusterKey, Set[WrappedClassAd[DJ]]]]: + """ + :return: iterator of all clusters and corresponding keys + """ return iter(self._clusters.items()) def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: + """ + Sort clusters by the ranking key and then by the amount of available + resources into nested lists of sets. + :return: + """ group = [] current_rank = None for ranked_key, drones in self._clusters.items(): @@ -579,6 +482,8 @@ class CondorClassadJobScheduler(JobScheduler): putting a job at a given slot is given by the amount of resources that might remain unallocated. :return: + + - aktuell nur pre job rank und job rank, kein post job rank implementiert """ def __init__( @@ -590,6 +495,15 @@ def __init__( interval: float = 60, autocluster: bool = False, ): + """ + Initializes the CondorClassadJobScheduler + :param job_queue: queue of jobs that are scheduled in the following simulation + :param machine_ad: ClassAd that is used with every drone + :param job_ad: ClassAd that is used with every job + :param pre_job_rank: ClassAd attribute that all drones are sorted by + :param interval: time between scheduling cycles + :param autocluster: could be used to decide whether to use autoclusters + """ self._stream_queue = job_queue self._drones: RankedClusters[Drone] = RankedNonClusters( quantization=quantization_defaults, ranking=parse(pre_job_rank) @@ -606,24 +520,51 @@ def __init__( @property def drone_list(self) -> Iterator[Drone]: + """ + Takes an iterator over the WrappedClassAd objects of drones known to the + scheduler, extracts the drones and returns an iterator over the drone objects. + :return: + """ for cluster in self._drones.clusters(): for drone in cluster: yield drone._wrapped def register_drone(self, drone: Drone): + """ + Provides the drones with the drone ClassAd, combines both into one object and + adds the resulting WrappedClassAd object to the drones known to the scheduler as + well as the dictionary containing all WrappedClassAd objects the scheduler + works with. + :param drone: + """ wrapped_drone = WrappedClassAd(classad=self._machine_classad, wrapped=drone) - self._wrapped_classads[drone] = wrapped_drone self._drones.add(wrapped_drone) + self._wrapped_classads[drone] = wrapped_drone def unregister_drone(self, drone: Drone): + """ + Remove a drone's representation from the scheduler's scope. + :param drone: + :return: + """ drone_wrapper = self._wrapped_classads[drone] self._drones.remove(drone_wrapper) def update_drone(self, drone: Drone): + """ + Update a drone's representation in the scheduler scope. + :param drone: + :return: + """ drone_wrapper = self._wrapped_classads[drone] self._drones.update(drone_wrapper) async def run(self): + """ + Runs the scheduler's functionality. 
One executed, the scheduler starts up and + begins to add the jobs that are + :return: + """ async with Scope() as scope: scope.do(self._collect_jobs()) async for _ in interval(self.interval): @@ -639,7 +580,44 @@ async def run(self): def _match_job( job: ClassAd, pre_job_clusters: Iterator[List[Set[WrappedClassAd[Drone]]]] ): + """ + Tries to find a match for the transferred job among the available drones. + :param job: job to match + :param pre_job_clusters: list of clusters of wrapped drones that are + presorted by a clustering mechanism of RankedAutoClusters/RankedNonClusters + that mimics the HTCondor NEGOTIATOR_PRE_JOB_RANK, short prejobrank. The + clusters contain drones that are considered to be equivalent with respect to all + Requirements and Ranks + that are used during the matchmaking process. This mimics the Autoclustering + functionality of HTCondor. + [[highest prejobrank {autocluster}, {autocluster}], ..., [lowest prejobrank { + autocluster}, {autocluster}] + :return: drone that is the best match for the job + + The matching is performed in several steps: + 1. The job's requirements are evaluted and only drones that meet them are + considered further. A drone of every autocluster is extracted from + pre_job_clusters and if it meets the job's requirements it is not removed + from pre_job_clusters. + 2. The autoclusters that are equivalent with respect to the prejobrank are + then sorted by the job's rank expression. The resulting format of + pre_job_clusters is + [[(highest prejobrank, highest jobrank) {autocluster} {autocluster}, + ..., (highest prejobrank, lowest jobrank) {autocluster}], ...] + 3. The resulting pre_job_clusters are then iterated and the drone with the + highest (prejobrank, jobrank) whose requirements are also compatible with the + job is returned as best match. + """ def debug_evaluate(expr, my, target=None): + """ + Reimplementation of the classad packages evaluate function. Having it + here enables developers to inspect the ClassAd evaluation process more + closely and to add debug output if necessary. + :param expr: + :param my: + :param target: + :return: + """ if type(expr) is str: expr = my[expr] result = expr.evaluate(my=my, target=target) @@ -684,6 +662,25 @@ def debug_evaluate(expr, my, target=None): raise NoMatch() async def _schedule_jobs(self): + """ + Handles the scheduling of jobs. Tried to match the jobs in the job queue to + available resources. This occurs in several steps. + 1. The list of drones known to the scheduler is copied. The copy can then be + used to keep track of the drones' available resources while matching jobs as + the jobs allocate resources on the original drones before being processed but + not during scheduling. + 2. The job in the job queue are matched to (the copied)resources iteratively. + The actual matching is performed by the `_match_job` method that returns the most + suitable drone unless no drone is compatible with the job's requirements. + If a match was found, the resources requested by the job are allocated on the matched drone. + If no resources remain unallocated after the last job's allocation, + the matching process is ended for this scheduler interval. + 3. After the job matching is finished, the matched jobs are removed from the + job queue as the index of a job in the job queue changes once a job with a + lower index is removed from the queue. + 4. The matched jobs' execution is triggered. 
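A condensed sketch of one such scheduling cycle, following steps 1 to 4 above; monitoring and the per-drone resource bookkeeping are omitted, and `NoMatch` stands in for the exception raised by `_match_job` when no drone fits:

NoMatch = Exception  # stand-in; the scheduler defines its own NoMatch exception

def schedule_cycle(scheduler):
    pre_job_drones = scheduler._drones.copy()            # 1. work on a copy of the drones
    matches = []
    for job in scheduler.job_queue:                       # 2. match jobs against the copy
        try:
            drone = scheduler._match_job(job, pre_job_drones.cluster_groups())
        except NoMatch:
            continue                                      # job stays in the queue for the next cycle
        matches.append((job, drone))
        # the job's requested resources are subtracted from the copied drone here
    for job, _drone in matches:                           # 3. remove matched jobs afterwards
        scheduler.job_queue.remove(job)
    return matches                                        # 4. execution is triggered per match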
+ :return: + """ # Pre Job Rank is the same for all jobs # Use a copy to allow temporary "remainder after match" estimates if self._drones.empty(): @@ -710,6 +707,8 @@ async def _schedule_jobs(self): - value ) pre_job_drones.update(matched_drone) + + # monitoring/coordination stuff if ( candidate_job._wrapped._total_input_data and matched_drone._wrapped.cached_data @@ -733,11 +732,24 @@ async def _schedule_jobs(self): await sampling_required.put(UserDemand(len(self.job_queue))) async def _execute_job(self, job: WrappedClassAd, drone: WrappedClassAd): + """ + Schedules a job on a drone by extracting both objects from the + respective WrappedClassAd and using the drone's scheduling functionality + :param job: + :param drone: + :return: + """ wrapped_job = job._wrapped wrapped_drone = drone._wrapped await wrapped_drone.schedule_job(wrapped_job) async def _collect_jobs(self): + """ + Combines jobs that are imported from the simulation's job config with a job + ClassAd and adds the resulting WrappedClassAd objects to the scheduler's job + queue. + :return: + """ async for job in self._stream_queue: wrapped_job = WrappedClassAd(classad=self._job_classad, wrapped=job) self._wrapped_classads[job] = wrapped_job @@ -750,6 +762,14 @@ async def _collect_jobs(self): self._collecting = False async def job_finished(self, job): + """ + Handles the impact of finishing jobs on the scheduler. If the job is completed + successfully, the amount of running jobs matched by the current scheduler + instance is reduced. If the job is not finished successfully, + it is resubmitted to the scheduler's job queue. + :param job: + :return: + """ if job.successful: await self._processing.decrease(jobs=1) else: From e06a8d7d658b7840ad036adb4f8857b148fac0bd Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Sep 2020 08:18:54 +0200 Subject: [PATCH 576/648] Added template for unit tests of the job class when supporting caching --- lapis_tests/test_job_caching.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 lapis_tests/test_job_caching.py diff --git a/lapis_tests/test_job_caching.py b/lapis_tests/test_job_caching.py new file mode 100644 index 0000000..d0d01b3 --- /dev/null +++ b/lapis_tests/test_job_caching.py @@ -0,0 +1,16 @@ +import pytest +from usim import Scope, time + +from lapis.drone import Drone +from lapis.job import Job + +from lapis_tests import via_usim, DummyScheduler, DummyDrone +from lapis.connection import Connection + + +class TestJobCaching(object): + def test_calculation_time(self): + pass + + def test_transfer_time(self): + pass From 3cc44f3c68e6ebde721b1cfc0fc4fa473785ba30 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Sep 2020 09:28:05 +0200 Subject: [PATCH 577/648] Updated drone documentation --- lapis/drone.py | 63 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 4d2efdb..f25a1b3 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -14,6 +14,9 @@ class ResourcesExceeded(Exception): class Drone(interfaces.Pool): + """ + Represents worker nodes in the simulation. 
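A hedged construction example with illustrative values; the keyword names follow the parameters documented in `__init__` below, while the scheduler and connection objects are assumed to exist already:

from lapis.drone import Drone

drone = Drone(
    scheduler=scheduler,                 # assumed existing scheduler instance
    pool_resources={"cores": 8, "memory": 16 * 1024 ** 3, "disk": 160 * 1024 ** 3},
    scheduling_duration=60,              # registers at the scheduler 60 time units after start up
    sitename="site_a",                   # hypothetical site identifier
    connection=connection,               # assumed Connection instance
)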
+ """ def __init__( self, scheduler, @@ -28,9 +31,15 @@ def __init__( < 1, ): """ - :param scheduler: - :param pool_resources: - :param scheduling_duration: + Drone initialization + :param scheduler: scheduler that assigns jobs to the drone + :param pool_resources: dict of the drone's resources + :param scheduling_duration: amount of time that passes between the drone's + start up and it's registration at the scheduler + :param ignore_resources: dict of the resource keys that are ignored, e.g. "disk" + :param sitename: identifier, used to determine which caches a drone can use + :param connection: connection object that holds remote connection and handles file transfers + :param empty: callable that determines whether the drone is currently running any jobs """ super(Drone, self).__init__() self.scheduler = scheduler @@ -62,17 +71,37 @@ def __init__( self.cached_data = 0 def empty(self): + """ + Checks whether there are any jobs running on this drone + :return: true if no jobs are running on this drone, false else + """ return self._empty(self) @property def theoretical_available_resources(self): + """ + Returns the amount of resources of the drone that were available if all jobs + used exactly the amount of resources they requested + :return: + """ return dict(self.resources.levels) @property def available_resources(self): + """ + Returns the amount of resources of the drone that are available based on the + amount of resources the running jobs actually use. + :return: + """ return dict(self.used_resources.levels) async def run(self): + """ + Handles the drone's activity during simulation. Upon execution the drone + registers itself at the scheduler and once jobs are scheduled to the drone's + job queue, these jobs are executed. Starting jobs via a job queue was + introduced to avoid errors in resource allocation and monitoring. + """ from lapis.monitor import sampling_required await (time + self.scheduling_duration) @@ -132,6 +161,9 @@ def _init_allocation_and_utilisation(self): self._utilisation = min(resources) async def shutdown(self): + """ + Upon shutdown, the drone unregisters from the scheduler. + """ from lapis.monitor import sampling_required self._supply = 0 @@ -149,18 +181,26 @@ async def shutdown(self): await (time + 1) async def schedule_job(self, job: Job, kill: bool = False): + """ + A job is scheduled to a drone by putting it in the drone's job queue. + :param job: job that was matched to the drone + :param kill: flag, if true jobs can be killed if they use more resources than they requested + """ await self._job_queue.put((job, kill)) async def _run_job(self, job: Job, kill: bool): """ Method manages to start a job in the context of the given drone. - The job is started independent of available resources. If resources of - drone are exceeded, the job is killed. - + The job is started regardless of the available resources. The resource + allocation takes place after starting the job and the job is killed if the + drone's overall resources are exceeded. In addition, if the `kill` flag is + set, jobs are killed if the resources they use exceed the resources they + requested. + Then the end of the job's execution is awaited and the drones status + known to the scheduler is changed. 
:param job: the job to start :param kill: if True, a job is killed when used resources exceed requested resources - :return: """ job.drone = self async with Scope() as scope: @@ -224,6 +264,15 @@ async def _run_job(self, job: Job, kill: bool): ) def look_up_cached_data(self, job: Job): + """ + Determines the amount of the job's input data that is stored in caches the + drone can access and sets the drone's `cached_data` attribute to the + resulting value. This quantity can then be used in the job matching process. + *Pay attention to the fact that the current implementation only works for + hitrate based caching and that while KeyErrors should not occur due to the + way the method is called, KeyErrors are not handled here.* + :param job: + """ cached_data = 0 caches = self.connection.storages.get(self.sitename, None) if caches: From f87ededfb2dc117aa7345c5348b0b95d52edbb08 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Sep 2020 09:46:37 +0200 Subject: [PATCH 578/648] Updated scheduler documentation --- lapis/scheduler.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index e142abb..7bc7d76 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -74,7 +74,20 @@ def empty(self): return self._wrapped.theoretical_available_resources["cores"] < 1 def __getitem__(self, item): + """ + This method is used when evaluating classad expressions. + :param item: name of a quantity in the classad expression + :return: current value of this item + """ def access_wrapped(name, requested=True): + """ + Extracts the wrapped object's current quantity of a certain resource ( + cores, memory, disk) + :param name: name of the reosurce that is to be accessed + :param requested: false if name is a resource of the drone, true if name + is a resource requested by a job + :return: value of respective resource + """ if isinstance(self._wrapped, Drone): return self._wrapped.theoretical_available_resources[name] if requested: @@ -350,7 +363,8 @@ def _clustering_key(self, item: WrappedClassAd[DJ]): Calculates an item's clustering key based on the specified ranking (in my use case the prejobrank) and the item's available resource. The resulting key's structure is (prejobrank value, (available cpus, available memory, available - disk space)) + disk space)). The clustering key is negative as the SortedDict sorts its entries + from low keys to high keys. :param item: drone for which the clustering key is calculated. :return: (prejobrank value, (available cpus, available memory, available disk space)) @@ -449,6 +463,11 @@ def update(self, item): self.add(item) def _clustering_key(self, item: WrappedClassAd[DJ]): + """ + For RankNonClusters there is only one clustering key, the objects defined + ranking. The clustering key is negative as the SortedDict sorts its entries + from low keys to high keys. + """ return -1.0 * self._ranking.evaluate(my=item) def clusters(self) -> Iterator[Set[WrappedClassAd[DJ]]]: @@ -458,7 +477,12 @@ def items(self) -> Iterator[Tuple[float, Set[WrappedClassAd[DJ]]]]: return iter(self._clusters.items()) def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: - + """ + Sorts cluster by the ranking key. As there is no autoclustering, every drone + is in a dedicated set and drones of the same ranking are combined into a list. + These lists are then sorted by increasing ranking. 
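Since the clustering key is the negated ranking, iterating the sorted keys in ascending order yields the highest-ranked drones first; a small illustration with made-up drones and rankings:

clusters = {-2.0: {"drone_a", "drone_b"}, -1.0: {"drone_c"}}   # key = -ranking
for _ranked_key, drones in sorted(clusters.items()):
    print(_ranked_key, [{item} for item in drones])
# -2.0 [{'drone_a'}, {'drone_b'}]   (ranking 2.0 comes first; set order is arbitrary)
# -1.0 [{'drone_c'}]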
+ :return: iterator of the lists containing drones with identical key + """ for _ranked_key, drones in self._clusters.items(): yield [{item} for item in drones] @@ -482,8 +506,6 @@ class CondorClassadJobScheduler(JobScheduler): putting a job at a given slot is given by the amount of resources that might remain unallocated. :return: - - - aktuell nur pre job rank und job rank, kein post job rank implementiert """ def __init__( From 1f70388f20d619a2ee597b2990bca7cd3400a592 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Sep 2020 11:16:05 +0200 Subject: [PATCH 579/648] Updated files module documentation --- lapis/files.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/lapis/files.py b/lapis/files.py index b43c8fb..1f65a47 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -2,6 +2,10 @@ class StoredFile(object): + """ + Object representing stored files + + """ __slots__ = ( "filename", @@ -22,22 +26,51 @@ def __init__( numberofaccesses: Optional[int] = None, **filespecs, ): + """ + Intialization of a stored file + :param filename: name of the file + :param filesize: size of the file + :param storedsize: size of the file that is actually stored, necessary if + less than the whole file is stored + :param cachedsince: time when the file was cached + :param lastaccessed: time when the file was accessed the last time + :param numberofaccesses: number of times the file was accessed + """ self.filename = filename + """name of the file """ self.filesize = filesize + """size of the file""" self.storedsize = storedsize or self.filesize + """size of the file that is actually stored""" self.cachedsince = cachedsince + """point in time when the file was cached""" self.lastaccessed = lastaccessed + """time when the file was accessed the last time""" self.numberofaccesses = numberofaccesses + """number of times the file was accessed""" def increment_accesses(self): + """ + Increments number of accesses of a file + """ self.numberofaccesses += 1 class RequestedFile(NamedTuple): + """ + Representation of a requested file + """ filename: str + """name of the file""" filesize: Optional[int] = None + """size of the file""" def convert_to_stored_file_object(self, currenttime): + """ + Converts a requested file into a stored file + + :param currenttime: point in time when the conversion takes place + """ print(self.filesize) return StoredFile( self.filename, @@ -49,6 +82,14 @@ def convert_to_stored_file_object(self, currenttime): class RequestedFile_HitrateBased(NamedTuple): + """ + Represents a requested file in hitrate based caching. + The cachehitrate flag is somewhat messed up currently. + **Its use should be reworked when remodeling the connection module.** + """ filename: str + """name of the requested file""" filesize: int + """size of the requested file""" cachehitrate: int + """flag whether the file is cached, 1 if it is cached, 0 if it is not cached""" From 463f4188244091289cb2a2bad4c214e3e872b789 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Sep 2020 11:17:29 +0200 Subject: [PATCH 580/648] Tried to enable autosummary for lapis --- docs/conf.py | 8 ++++++++ docs/index.rst | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index db8cc29..60c0e41 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -44,6 +44,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ +# "sphinx_automodapi.automodapi", + "sphinx.ext.autosummary", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.todo", @@ -53,6 +55,12 @@ "sphinx_click.ext", ] + +autodoc_default_flags = ['members'] +autosummary_generate = True +autodoc_mock_imports = ["cobald", "usim", "sortedcontainers", "classad"] + + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/docs/index.rst b/docs/index.rst index 5124ee9..996af2f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -92,6 +92,17 @@ Indices and tables * :ref:`modindex` * :ref:`search` +Class overview +-------------- + + +.. autosummary:: + :toctree: stubs + + lapis.files + lapis.job + + .. _HTCondor: https://research.cs.wisc.edu/htcondor/ .. _COBalD: https://cobald.readthedocs.io/en/latest/ .. _TARDIS: https://cobald-tardis.readthedocs.io/en/latest From 673b472266821ad79f7613b06aacb9cfb61eb529 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 30 Sep 2020 11:39:51 +0200 Subject: [PATCH 581/648] Moved CondorJobScheduler back into scheduler module due to documentation issues --- lapis/scheduler.py | 152 +++++++++++++++++++++++++++++ lapis/scheduler_withoutClassAds.py | 152 ----------------------------- 2 files changed, 152 insertions(+), 152 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 7bc7d76..4d17643 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -226,7 +226,159 @@ async def job_finished(self, job): """ raise NotImplementedError +class CondorJobScheduler(JobScheduler): + """ + Goal of the htcondor job scheduler is to have a scheduler that somehow + mimics how htcondor does schedule jobs. + Htcondor does scheduling based on a priority queue. The priorities itself + are managed by operators of htcondor. + So different instances can apparently behave very different. + In my case I am going to try building a priority queue that sorts job slots + by increasing cost. The cost itself is calculated based on the current + strategy that is used at GridKa. The scheduler checks if a job either + exactly fits a slot or if it does fit into it several times. The cost for + putting a job at a given slot is given by the amount of resources that + might remain unallocated. 
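A standalone sketch of this slot-cost heuristic with made-up resource numbers (the actual implementation is the `_schedule_job` method below):

def slot_cost(job_resources, drone_available, pool_resources):
    cost = 0.0
    for resource_type, requested in job_resources.items():
        available = drone_available.get(resource_type, 0)
        if available < requested:
            return float("Inf")               # job does not fit this slot at all
        cost += 1 / (available // requested)  # fits n times -> contribution 1/n
    for resource_type in pool_resources:
        if resource_type not in job_resources:
            cost += drone_available[resource_type]   # resources that would stay unused
    return cost / len((*job_resources, *pool_resources))

# a job needing 2 cores / 4 GB on a slot offering exactly 4 cores / 8 GB
print(slot_cost({"cores": 2, "memory": 4}, {"cores": 4, "memory": 8},
                {"cores": 4, "memory": 8}))   # 0.25 -> cost <= 1, job starts directly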
+ :return: + """ + def __init__(self, job_queue): + self._stream_queue = job_queue + self.drone_cluster = [] + self.interval = 60 + self.job_queue = JobQueue() + self._collecting = True + self._processing = Resources(jobs=0) + + @property + def drone_list(self): + for cluster in self.drone_cluster: + for drone in cluster: + yield drone + + def register_drone(self, drone: Drone): + self._add_drone(drone) + + def unregister_drone(self, drone: Drone): + for cluster in self.drone_cluster: + try: + cluster.remove(drone) + except ValueError: + pass + else: + if len(cluster) == 0: + self.drone_cluster.remove(cluster) + + def _add_drone(self, drone: Drone, drone_resources: Dict = None): + minimum_distance_cluster = None + distance = float("Inf") + if len(self.drone_cluster) > 0: + for cluster in self.drone_cluster: + current_distance = 0 + for key in {*cluster[0].pool_resources, *drone.pool_resources}: + if drone_resources: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone_resources.get(key, 0) + ) + else: + current_distance += abs( + cluster[0].theoretical_available_resources.get(key, 0) + - drone.theoretical_available_resources.get(key, 0) + ) + if current_distance < distance: + minimum_distance_cluster = cluster + distance = current_distance + if distance < 1: + minimum_distance_cluster.append(drone) + else: + self.drone_cluster.append([drone]) + else: + self.drone_cluster.append([drone]) + + def update_drone(self, drone: Drone): + self.unregister_drone(drone) + self._add_drone(drone) + + async def run(self): + async with Scope() as scope: + scope.do(self._collect_jobs()) + async for _ in interval(self.interval): + for job in self.job_queue.copy(): + best_match = self._schedule_job(job) + if best_match: + await best_match.schedule_job(job) + self.job_queue.remove(job) + await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) + self.unregister_drone(best_match) + left_resources = best_match.theoretical_available_resources + left_resources = { + key: value - job.resources.get(key, 0) + for key, value in left_resources.items() + } + self._add_drone(best_match, left_resources) + if ( + not self._collecting + and not self.job_queue + and self._processing.levels.jobs == 0 + ): + break + await sampling_required.put(self) + + async def _collect_jobs(self): + async for job in self._stream_queue: + self.job_queue.append(job) + await self._processing.increase(jobs=1) + # TODO: logging happens with each job + await sampling_required.put(self.job_queue) + await sampling_required.put(UserDemand(len(self.job_queue))) + self._collecting = False + + async def job_finished(self, job): + if job.successful: + await self._processing.decrease(jobs=1) + else: + await self._stream_queue.put(job) + + def _schedule_job(self, job) -> Drone: + priorities = {} + for cluster in self.drone_cluster: + drone = cluster[0] + cost = 0 + resources = drone.theoretical_available_resources + for resource_type in job.resources: + if resources.get(resource_type, 0) < job.resources[resource_type]: + # Inf for all job resources that a drone does not support + # and all resources that are too small to even be considered + cost = float("Inf") + break + else: + try: + cost += 1 / ( + resources[resource_type] // job.resources[resource_type] + ) + except KeyError: + pass + for additional_resource_type in [ + key for key in drone.pool_resources if key not in job.resources + ]: + cost += resources[additional_resource_type] + cost /= 
len((*job.resources, *drone.pool_resources)) + if cost <= 1: + # directly start job + return drone + try: + priorities[cost].append(drone) + except KeyError: + priorities[cost] = [drone] + try: + minimal_key = min(priorities) + if minimal_key < float("Inf"): + return priorities[minimal_key][0] + except ValueError: + pass + return None # HTCondor ClassAd Scheduler diff --git a/lapis/scheduler_withoutClassAds.py b/lapis/scheduler_withoutClassAds.py index 83ceba7..9441c77 100644 --- a/lapis/scheduler_withoutClassAds.py +++ b/lapis/scheduler_withoutClassAds.py @@ -9,156 +9,4 @@ from lapis.scheduler import JobScheduler -class CondorJobScheduler(JobScheduler): - """ - Goal of the htcondor job scheduler is to have a scheduler that somehow - mimics how htcondor does schedule jobs. - Htcondor does scheduling based on a priority queue. The priorities itself - are managed by operators of htcondor. - So different instances can apparently behave very different. - In my case I am going to try building a priority queue that sorts job slots - by increasing cost. The cost itself is calculated based on the current - strategy that is used at GridKa. The scheduler checks if a job either - exactly fits a slot or if it does fit into it several times. The cost for - putting a job at a given slot is given by the amount of resources that - might remain unallocated. - :return: - """ - def __init__(self, job_queue): - self._stream_queue = job_queue - self.drone_cluster = [] - self.interval = 60 - self.job_queue = JobQueue() - self._collecting = True - self._processing = Resources(jobs=0) - - @property - def drone_list(self): - for cluster in self.drone_cluster: - for drone in cluster: - yield drone - - def register_drone(self, drone: Drone): - self._add_drone(drone) - - def unregister_drone(self, drone: Drone): - for cluster in self.drone_cluster: - try: - cluster.remove(drone) - except ValueError: - pass - else: - if len(cluster) == 0: - self.drone_cluster.remove(cluster) - - def _add_drone(self, drone: Drone, drone_resources: Dict = None): - minimum_distance_cluster = None - distance = float("Inf") - if len(self.drone_cluster) > 0: - for cluster in self.drone_cluster: - current_distance = 0 - for key in {*cluster[0].pool_resources, *drone.pool_resources}: - if drone_resources: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone_resources.get(key, 0) - ) - else: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0) - ) - if current_distance < distance: - minimum_distance_cluster = cluster - distance = current_distance - if distance < 1: - minimum_distance_cluster.append(drone) - else: - self.drone_cluster.append([drone]) - else: - self.drone_cluster.append([drone]) - - def update_drone(self, drone: Drone): - self.unregister_drone(drone) - self._add_drone(drone) - - async def run(self): - async with Scope() as scope: - scope.do(self._collect_jobs()) - async for _ in interval(self.interval): - for job in self.job_queue.copy(): - best_match = self._schedule_job(job) - if best_match: - await best_match.schedule_job(job) - self.job_queue.remove(job) - await sampling_required.put(self.job_queue) - await sampling_required.put(UserDemand(len(self.job_queue))) - self.unregister_drone(best_match) - left_resources = best_match.theoretical_available_resources - left_resources = { - key: value - job.resources.get(key, 0) - for key, value in left_resources.items() - } - self._add_drone(best_match, 
left_resources) - if ( - not self._collecting - and not self.job_queue - and self._processing.levels.jobs == 0 - ): - break - await sampling_required.put(self) - - async def _collect_jobs(self): - async for job in self._stream_queue: - self.job_queue.append(job) - await self._processing.increase(jobs=1) - # TODO: logging happens with each job - await sampling_required.put(self.job_queue) - await sampling_required.put(UserDemand(len(self.job_queue))) - self._collecting = False - - async def job_finished(self, job): - if job.successful: - await self._processing.decrease(jobs=1) - else: - await self._stream_queue.put(job) - - def _schedule_job(self, job) -> Drone: - priorities = {} - for cluster in self.drone_cluster: - drone = cluster[0] - cost = 0 - resources = drone.theoretical_available_resources - for resource_type in job.resources: - if resources.get(resource_type, 0) < job.resources[resource_type]: - # Inf for all job resources that a drone does not support - # and all resources that are too small to even be considered - cost = float("Inf") - break - else: - try: - cost += 1 / ( - resources[resource_type] // job.resources[resource_type] - ) - except KeyError: - pass - for additional_resource_type in [ - key for key in drone.pool_resources if key not in job.resources - ]: - cost += resources[additional_resource_type] - cost /= len((*job.resources, *drone.pool_resources)) - if cost <= 1: - # directly start job - return drone - try: - priorities[cost].append(drone) - except KeyError: - priorities[cost] = [drone] - try: - minimal_key = min(priorities) - if minimal_key < float("Inf"): - return priorities[minimal_key][0] - except ValueError: - pass - return None From 0675c2fe467d68313c9a95e7b87e6220de4fd06c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 2 Oct 2020 17:39:49 +0200 Subject: [PATCH 582/648] Updated string documentation --- lapis/connection.py | 24 ++++++--- lapis/drone.py | 29 ++++++++++- lapis/files.py | 1 + lapis/job.py | 105 +++++++++++++++++++++++++--------------- lapis/scheduler.py | 28 ++++++++--- lapis/simulator.py | 5 ++ lapis/storageelement.py | 33 ++++++++++--- 7 files changed, 162 insertions(+), 63 deletions(-) diff --git a/lapis/connection.py b/lapis/connection.py index 474d387..4d7f9c1 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -2,7 +2,8 @@ from typing import Union, Optional from usim import Scope, time -from monitoredpipe import MonitoredPipe +from lapis.monitoredpipe import MonitoredPipe + from lapis.cachealgorithm import ( CacheAlgorithm, @@ -25,11 +26,11 @@ class Connection(object): functionality should be tested thoroughly before being activated. TODO:: this concept should be abolished, remote storages should be created based - on configs as normal storages. There should be an additional site class that - manages the mapping of storages and drones and the connection class should be - limited to managing and directing file transfers to the correct site, if this is - even necessary. Furthermore, the mechanics for choosing between caching scenarios - should be redesigned. + on configs as normal storages. There should be an additional site class that + manages the mapping of storages and drones and the connection class should be + limited to managing and directing file transfers to the correct site, if this is + even necessary. Furthermore, the mechanics for choosing between caching scenarios + should be redesigned. 
""" __slots__ = ( @@ -46,7 +47,9 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): :param filebased_caching: """ self.storages = dict() + """dictionary containing storage objects known to the connection module""" self.remote_connection = RemoteStorage(MonitoredPipe(throughput=throughput)) + """pipe object representing the connection to a remote storage""" self.caching_algorithm = CacheAlgorithm( caching_strategy=lambda file, storage: check_size(file, storage) and check_relevance(file, storage), @@ -54,7 +57,10 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): file, storage ), ) + """cache behavior filebased caching, contains both caching and deletion + strategy""" self._filebased_caching = filebased_caching + """flag, true if filebased caching is current caching mode""" async def run_pipemonitoring(self): """ @@ -75,6 +81,7 @@ def add_storage_element(self, storage_element: StorageElement): """ Register storage element in Connetion module, clustering storage elements by sitename + :param storage_element: :return: """ @@ -97,6 +104,7 @@ async def _determine_inputfile_source( and the storage object where the biggest part of the file is cached is returned. If the file is not cached in any storage object the connection module remote connection is returned. + :param requested_file: :param dronesite: :param job_repr: @@ -124,6 +132,7 @@ async def stream_file( startes the files transfer. For files transfered via remote connection a potential cache decides whether to cache the file and handles the caching process. + :param requested_file: :param dronesite: :param job_repr: @@ -158,7 +167,8 @@ async def stream_file( async def transfer_files(self, drone, requested_files: dict, job_repr): """ Converts dict information about requested files to RequestedFile object and - sequentially streams all files. If there is information about input files but no informations about the file size, th + sequentially streams all files. 
+ :param drone: :param requested_files: :param job_repr: diff --git a/lapis/drone.py b/lapis/drone.py index f25a1b3..f5fc159 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -32,6 +32,7 @@ def __init__( ): """ Drone initialization + :param scheduler: scheduler that assigns jobs to the drone :param pool_resources: dict of the drone's resources :param scheduling_duration: amount of time that passes between the drone's @@ -43,12 +44,21 @@ def __init__( """ super(Drone, self).__init__() self.scheduler = scheduler + """scheduler that assigns jobs to the drone""" self.connection = connection + """connection object that holds remote connection and handles file transfers""" self.sitename = sitename + """identifies the site the drone belongs to, used to determine which caches a + drone can use """ self.pool_resources = pool_resources + """dict stating the drone's resources""" self.resources = Capacities(**pool_resources) + """available resources, based on the amount of resources requested by + jobs running on the drone """ # shadowing requested resources to determine jobs to be killed self.used_resources = Capacities(**pool_resources) + """available resources, based on the amount of resources actually used by + jobs running on the drone""" if ignore_resources: self._valid_resource_keys = [ @@ -59,20 +69,30 @@ def __init__( else: self._valid_resource_keys = self.pool_resources.keys() self.scheduling_duration = scheduling_duration + """amount of time that passes between the drone's + start up and it's registration at the scheduler""" self._supply = 0 self.jobs = 0 + """number of jobs running on the drone""" self._allocation = None self._utilisation = None self._job_queue = Queue() self._empty = empty + """method that is used to determine whether a drone is empty""" # caching-related self.jobs_with_cached_data = 0 + """amount of jobs that currently run on the drone and that could read from + the cache""" self.cached_data = 0 + """used during scheduling, calculated for each job, is assigned the + expectation value for the amount of cached data that is available to the + drone""" def empty(self): """ Checks whether there are any jobs running on this drone + :return: true if no jobs are running on this drone, false else """ return self._empty(self) @@ -82,7 +102,8 @@ def theoretical_available_resources(self): """ Returns the amount of resources of the drone that were available if all jobs used exactly the amount of resources they requested - :return: + + :return: dictionary of theoretically available resources """ return dict(self.resources.levels) @@ -91,7 +112,8 @@ def available_resources(self): """ Returns the amount of resources of the drone that are available based on the amount of resources the running jobs actually use. - :return: + + :return: dictionary of available resources """ return dict(self.used_resources.levels) @@ -183,6 +205,7 @@ async def shutdown(self): async def schedule_job(self, job: Job, kill: bool = False): """ A job is scheduled to a drone by putting it in the drone's job queue. + :param job: job that was matched to the drone :param kill: flag, if true jobs can be killed if they use more resources than they requested """ @@ -198,6 +221,7 @@ async def _run_job(self, job: Job, kill: bool): requested. Then the end of the job's execution is awaited and the drones status known to the scheduler is changed. 
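An illustration of the two resource views described above, with made-up numbers: a running job requested 4 cores but currently uses only 2, so the theoretical view subtracts the request while the other view subtracts the actual consumption.

pool = {"cores": 8}
requested_by_jobs = {"cores": 4}
actually_used_by_jobs = {"cores": 2}
theoretical_available = {key: pool[key] - requested_by_jobs.get(key, 0) for key in pool}   # {'cores': 4}
available = {key: pool[key] - actually_used_by_jobs.get(key, 0) for key in pool}           # {'cores': 6}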
+ :param job: the job to start :param kill: if True, a job is killed when used resources exceed requested resources @@ -271,6 +295,7 @@ def look_up_cached_data(self, job: Job): *Pay attention to the fact that the current implementation only works for hitrate based caching and that while KeyErrors should not occur due to the way the method is called, KeyErrors are not handled here.* + :param job: """ cached_data = 0 diff --git a/lapis/files.py b/lapis/files.py index 1f65a47..59a7f19 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -28,6 +28,7 @@ def __init__( ): """ Intialization of a stored file + :param filename: name of the file :param filesize: size of the file :param storedsize: size of the file that is actually stored, necessary if diff --git a/lapis/job.py b/lapis/job.py index 0bbcc9c..8d09628 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -1,6 +1,5 @@ import logging from typing import Optional, TYPE_CHECKING - from usim import time, Scope, instant from usim import CancelTask @@ -14,14 +13,15 @@ class Job(object): """ Objects of this class represent jobs. The job is described from the batch system's viewpoint by the following attributes: - :param resources: information about the resources requested by the job - :param used_resources: information about the resources used by the job - :param walltime: the job's runtime, in reality as well as in the simulation - :param requested_walltime: walltime requested by the job - :param cputime: the cumulated amount of time the used CPU(s) was (were) active - during the job's execution - :param queue_date: time when the job was submitted to the simulated job queue - _success: represents whether the job was run successfully + + :param resources: information about the resources requested by the job + :param used_resources: information about the resources used by the job + :param walltime: the job's runtime, in reality as well as in the simulation + :param requested_walltime: walltime requested by the job + :param cputime: the cumulated amount of time the used CPU(s) was (were) active + during the job's execution + :param queue_date: time when the job was submitted to the simulated job queue + _success: represents whether the job was run successfully In addition, functionality allowing the simulation of data transfers is provided. In this case, the job attributes have to be extended by information about the job's @@ -34,31 +34,31 @@ class Job(object): data set was transferred in a large number of blocks, the job's runtime (walltime) can be recalculated using max(calculation time, transfer time). - :param requested_inputfiles: information about the input files requested by a job - :param used_inputfiles: information about the input files actually read by a job - :param _total_input_data: data volume of used_inputfiles, amount of data - processed by the job - :param _original_walltime: the job's walltime as the simulation's input - states. Is stored for monitoring purposes as the job's walltime can be - altered - :param _calculation_time: the calculation time represents the time needed to - process the job's input data - :param calculation_efficiency: represents the efficiency of calculations - performed on the job's input data. Default = 1.0. Can be modified to take - programmatical insufficiencies into account. - :param _transfer_time: the transfer time represents the time needed to - transfer the job's input data. 
+ :param requested_inputfiles: information about the input files requested by a job + :param used_inputfiles: information about the input files actually read by a job + :param _total_input_data: data volume of used_inputfiles, amount of data + processed by the job + :param _original_walltime: the job's walltime as the simulation's input + states. Is stored for monitoring purposes as the job's walltime can be + altered + :param _calculation_time: the calculation time represents the time needed to + process the job's input data + :param calculation_efficiency: represents the efficiency of calculations + performed on the job's input data. Default = 1.0. Can be modified to take + programmatical insufficiencies into account. + :param _transfer_time: the transfer time represents the time needed to + transfer the job's input data. As the simulation of data transfers is used to simulate and study caching, the following metadata are introduced and used for monitoring purposes. - :param _read_from_cache: true if job read data from cache - :param _cached_data: the amount of input data that is currently cached - :param failed_matches: number of times a match of this job to a resource was - rejected (see scheduler for details) - :param cache_probability: the averaged probability that all data are cached - (sum(filesize * hitrate = probability that file is cached) / sum(filesize)) - :param expectation_cached_data: the expectation value for the amount of - cached data (sum(filesize * hitrate)) + :param _read_from_cache: true if job read data from cache + :param _cached_data: the amount of input data that is currently cached + :param failed_matches: number of times a match of this job to a resource was + rejected (see scheduler for details) + :param cache_probability: the averaged probability that all data are cached + (sum(filesize * hitrate = probability that file is cached) / sum(filesize)) + :param expectation_cached_data: the expectation value for the amount of + cached data (sum(filesize * hitrate)) """ __slots__ = ( @@ -108,12 +108,16 @@ def __init__( :param in_queue_since: Time when job was inserted into the queue of the simulation scheduler :param queue_date: Time when job was inserted into queue in real life - :param name: Name of the job - :param drone: Drone where the job is running on - :param calculation_efficiency: + :param name: name of the job + :param drone: drone the job is running on + :param calculation_efficiency: efficiency of the job's calculations, + can be < 1.0 to account for programmatical insufficiencie """ self.resources = resources + """dict containing resources requested by the job""" self.used_resources = used_resources + """dict containing resources actually used by the job""" + for key in used_resources: if key not in resources: logging.getLogger("implementation").info( @@ -123,29 +127,47 @@ def __init__( ) self.resources[key] = self.used_resources[key] self.walltime: int = used_resources.pop("walltime") + """the job's runtime, in reality as well as in the simulation""" self.requested_walltime: Optional[int] = resources.pop("walltime", None) + """estimate of the job's walltime""" self.queue_date = queue_date + """ point in time when the job was submitted to the simulated job queue""" assert in_queue_since >= 0, "Queue time cannot be negative" self.in_queue_since = in_queue_since + """Time when job was inserted into the queue of the simulation scheduler""" self.in_queue_until: Optional[float] = None + """point in time when the job left the job queue""" self.drone = drone 
+ """drone the job is executed on""" self._name = name + """identifier of the job""" self._success: Optional[bool] = None + """flag indicating whether the job was completed successfully""" self.calculation_efficiency = calculation_efficiency - + """efficiency of the job's calculations, can be < 1.0 to account for + programmatical insufficiencies""" # caching-related self.requested_inputfiles = resources.pop("inputfiles", None) + """dict of input files requested by the job and respective file sizes""" self.used_inputfiles = used_resources.pop("inputfiles", None) + """dict of input files read by the job and respective amount of read data""" self._read_from_cache = 0 + """flag indicating whether the job read from the cache""" self._cached_data = 0 + """expectation value for the amount of data that was read from a cache by + this job""" self._original_walltime = self.walltime + """stores the jobs original walltime as a reference""" self._calculation_time = 0 + """time the job takes only to perform all calculations""" self._transfer_time = 0 + """time the job takes only to transfer all input data""" # TODO: this try-except is a fix for a unit test, check whether this makes # sense in all use cases try: self.cputime = self.used_resources["cores"] * self.walltime + """walltime of the job if the CPU efficienca was always optimal""" except KeyError: self.cputime = None @@ -153,6 +175,7 @@ def __init__( self._total_input_data = sum( [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] ) + """total data volume of the job's input diles""" except AttributeError: self._total_input_data = 0 @@ -167,6 +190,7 @@ def __init__( ) else: self.expectation_cached_data = 0 + """amount of data that was read from the cache""" if self._total_input_data: self.cache_probability = sum( @@ -179,6 +203,8 @@ def __init__( self.cache_probability = 0 self.failed_matches = 0 + """number of times the job entered the matchmaking process but was not + scheduled to a drone""" @property def name(self) -> str: @@ -210,9 +236,9 @@ async def _calculate(self): divided by a configurable `calculation_efficiency` that can be set != 1, e.g. to account for programmatical inefficiencies. - Else, the calculation time remains equal to the job's orginal `walltime`. + Else, the calculation time remains equal to the job's orginal`walltime`. + :param calculation_efficiency: - :return: """ result = self.walltime try: @@ -250,8 +276,8 @@ async def run(self, drone: "Drone"): `_transfer_inputfiles`. The job will be executed successfully unless the selected drone does not provide enough resources, is unavailable or an exception occurs. 
+ :param drone: the drone object the job was allocated to and is executed in - :return: """ assert drone, "Jobs cannot run without a drone being assigned" self.drone = drone @@ -290,10 +316,9 @@ def __repr__(self): async def job_to_queue_scheduler(job_generator, job_queue): """ Handles reading the simulation's job input and puts the job's into the job queue + :param job_generator: reader object that yields jobs from input :param job_queue: queue the jobs are added to - :return: - """ base_date = None for job in job_generator: diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 4d17643..1d79fe5 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -54,6 +54,7 @@ class WrappedClassAd(ClassAd, Generic[DJ]): def __init__(self, classad: ClassAd, wrapped: DJ): """ Initialization for wrapped ClassAd + :param classad: the wrapped objects ClassAd description :param wrapped: wrapped object, either job or drone """ @@ -66,6 +67,7 @@ def empty(self): """ Only relevant for wrapped drones to determine whether there are jobs running on them. If this is the case the amount of cores in usage is >= 1. + :return: true if no CPU cores are in use, false if this is not the case """ try: @@ -76,6 +78,7 @@ def empty(self): def __getitem__(self, item): """ This method is used when evaluating classad expressions. + :param item: name of a quantity in the classad expression :return: current value of this item """ @@ -83,6 +86,7 @@ def access_wrapped(name, requested=True): """ Extracts the wrapped object's current quantity of a certain resource ( cores, memory, disk) + :param name: name of the reosurce that is to be accessed :param requested: false if name is a resource of the drone, true if name is a resource requested by a job @@ -239,7 +243,6 @@ class CondorJobScheduler(JobScheduler): exactly fits a slot or if it does fit into it several times. The cost for putting a job at a given slot is given by the amount of resources that might remain unallocated. - :return: """ def __init__(self, job_queue): @@ -452,7 +455,7 @@ class RankedAutoClusters(RankedClusters[DJ]): def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): """ - :param quantization: + :param quantization: factors to convert resources into HTCondor scalings :param ranking: prejobrank expression """ self._quantization = quantization @@ -464,6 +467,7 @@ def empty(self) -> bool: """ Checks whether all drones in the RankedCluster are empty and currently not running any jobs. + :return: """ for drones in self._clusters.values(): @@ -486,6 +490,7 @@ def add(self, item: WrappedClassAd[DJ]): sorted in into the clusters accordingly. If there are already items with the same key, the new item is added to the existing cluster. If not, a new cluster is created. + :param item: :return: """ @@ -501,6 +506,7 @@ def add(self, item: WrappedClassAd[DJ]): def remove(self, item: WrappedClassAd[DJ]): """ Removes the item. + :param item: :return: """ @@ -517,6 +523,7 @@ def _clustering_key(self, item: WrappedClassAd[DJ]): structure is (prejobrank value, (available cpus, available memory, available disk space)). The clustering key is negative as the SortedDict sorts its entries from low keys to high keys. + :param item: drone for which the clustering key is calculated. 
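# Strongly simplified sketch of the quantised clustering-key idea described
# above; it ignores the HTCondor integer types and the prejobrank expression,
# and all names and numbers are illustrative only.
def clustering_key(available: dict, quantization: dict) -> tuple:
    # bucket the available resources so that drones with similar free capacity
    # share a cluster; negated because SortedDict sorts ascending
    return tuple(
        -(available[resource] // quantization[resource])
        for resource in sorted(quantization)
    )

clustering_key({"cores": 7, "memory": 14000}, {"cores": 1, "memory": 2000})
# -> (-7, -7)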
:return: (prejobrank value, (available cpus, available memory, available disk space)) @@ -547,6 +554,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: """ Sort clusters by the ranking key and then by the amount of available resources into nested lists of sets. + :return: """ group = [] @@ -633,6 +641,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: Sorts cluster by the ranking key. As there is no autoclustering, every drone is in a dedicated set and drones of the same ranking are combined into a list. These lists are then sorted by increasing ranking. + :return: iterator of the lists containing drones with identical key """ for _ranked_key, drones in self._clusters.items(): @@ -657,7 +666,6 @@ class CondorClassadJobScheduler(JobScheduler): exactly fits a slot or if it does fit into it several times. The cost for putting a job at a given slot is given by the amount of resources that might remain unallocated. - :return: """ def __init__( @@ -671,6 +679,7 @@ def __init__( ): """ Initializes the CondorClassadJobScheduler + :param job_queue: queue of jobs that are scheduled in the following simulation :param machine_ad: ClassAd that is used with every drone :param job_ad: ClassAd that is used with every job @@ -697,6 +706,7 @@ def drone_list(self) -> Iterator[Drone]: """ Takes an iterator over the WrappedClassAd objects of drones known to the scheduler, extracts the drones and returns an iterator over the drone objects. + :return: """ for cluster in self._drones.clusters(): @@ -709,6 +719,7 @@ def register_drone(self, drone: Drone): adds the resulting WrappedClassAd object to the drones known to the scheduler as well as the dictionary containing all WrappedClassAd objects the scheduler works with. + :param drone: """ wrapped_drone = WrappedClassAd(classad=self._machine_classad, wrapped=drone) @@ -718,6 +729,7 @@ def register_drone(self, drone: Drone): def unregister_drone(self, drone: Drone): """ Remove a drone's representation from the scheduler's scope. + :param drone: :return: """ @@ -727,6 +739,7 @@ def unregister_drone(self, drone: Drone): def update_drone(self, drone: Drone): """ Update a drone's representation in the scheduler scope. + :param drone: :return: """ @@ -737,6 +750,7 @@ async def run(self): """ Runs the scheduler's functionality. One executed, the scheduler starts up and begins to add the jobs that are + :return: """ async with Scope() as scope: @@ -756,6 +770,7 @@ def _match_job( ): """ Tries to find a match for the transferred job among the available drones. + :param job: job to match :param pre_job_clusters: list of clusters of wrapped drones that are presorted by a clustering mechanism of RankedAutoClusters/RankedNonClusters @@ -787,6 +802,7 @@ def debug_evaluate(expr, my, target=None): Reimplementation of the classad packages evaluate function. Having it here enables developers to inspect the ClassAd evaluation process more closely and to add debug output if necessary. + :param expr: :param my: :param target: @@ -853,7 +869,7 @@ async def _schedule_jobs(self): job queue as the index of a job in the job queue changes once a job with a lower index is removed from the queue. 4. The matched jobs' execution is triggered. 
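# Minimal usage sketch of the drone bookkeeping interface documented above;
# `job_queue` and `drone` are assumed to be created elsewhere in the
# simulation setup and only the scheduler's default ClassAds are used.
scheduler = CondorClassadJobScheduler(job_queue=job_queue)
scheduler.register_drone(drone)      # wrap the drone in a ClassAd and track it
scheduler.update_drone(drone)        # refresh its representation after changes
scheduler.unregister_drone(drone)    # drop it from the scheduler's scope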
- :return: + """ # Pre Job Rank is the same for all jobs # Use a copy to allow temporary "remainder after match" estimates @@ -909,9 +925,9 @@ async def _execute_job(self, job: WrappedClassAd, drone: WrappedClassAd): """ Schedules a job on a drone by extracting both objects from the respective WrappedClassAd and using the drone's scheduling functionality + :param job: :param drone: - :return: """ wrapped_job = job._wrapped wrapped_drone = drone._wrapped @@ -922,7 +938,6 @@ async def _collect_jobs(self): Combines jobs that are imported from the simulation's job config with a job ClassAd and adds the resulting WrappedClassAd objects to the scheduler's job queue. - :return: """ async for job in self._stream_queue: wrapped_job = WrappedClassAd(classad=self._job_classad, wrapped=job) @@ -942,7 +957,6 @@ async def job_finished(self, job): instance is reduced. If the job is not finished successfully, it is resubmitted to the scheduler's job queue. :param job: - :return: """ if job.successful: await self._processing.decrease(jobs=1) diff --git a/lapis/simulator.py b/lapis/simulator.py index db9c51f..7f0ff82 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -37,6 +37,11 @@ class Simulator(object): def __init__(self, seed=1234): + """ + Initializes simulator + + :param seed: random seed + """ random.seed(seed) self.job_queue: Queue = Queue() self.pools: List[Pool] = [] diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 6d8d23f..f7d8916 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -1,7 +1,7 @@ from typing import Optional from usim import time, Resources, Scope -from monitoredpipe import MonitoredPipe +from lapis.monitoredpipe import MonitoredPipe from lapis.monitor import sampling_required from lapis.files import StoredFile, RequestedFile, RequestedFile_HitrateBased @@ -24,6 +24,7 @@ def __init__(self, pipe: MonitoredPipe): """ Initialization of the remote storages pipe, representing the network connection to remote storage with a limited bandwidth. + :param pipe: """ self.connection = pipe @@ -44,6 +45,7 @@ def used(self): async def transfer(self, file: RequestedFile, **kwargs): """ Simulates the transfer of a requested file via the remote storage's pipe. + :param file: representation of the requested file """ await self.connection.transfer(total=file.filesize) @@ -105,7 +107,8 @@ def __init__( Intialization of a storage element object. 
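# Usage sketch for the seeded constructor documented above; 1234 mirrors the
# CLI default, and the assumption is that reproducibility covers Python's
# `random` module, which is what the constructor seeds.
from lapis.simulator import Simulator

simulator = Simulator(seed=1234)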
:param name: identification of the storage - :param sitename: + :param sitename: identifier, drones with the same sitename can access this + storage :param size: total size of the storage in bytes :param throughput_limit: maximal bandwidth of the network connection to this storage @@ -116,18 +119,30 @@ def __init__( information is updated """ self.name = name + """identification of the storage""" self.sitename = sitename + """identifier, drones with the same sitename can access this + storage""" self.deletion_duration = deletion_duration + """amount of time passing while a file is deleted from the storage""" self.update_duration = update_duration + """amount of time passing while a file's information is updated""" self._size = size + """size of the storage""" self.files = files + """dict of files currently in the storage""" self._usedstorage = Resources( size=sum(file.storedsize for file in files.values()) ) + """amount of storage space that is currently in use""" self.connection = MonitoredPipe(throughput_limit) + """Pipe representing the network connection to this storage + **Namespace problem between connection module and this pipe called + connection**""" self.connection.storage = repr(self) self.remote_storage = None + """remote storage that provides files that are not stored in the cache""" @property def size(self): @@ -145,9 +160,9 @@ async def remove(self, file: StoredFile, job_repr=None): """ Deletes file from storage object. The time this operation takes is defined by the storages deletion_duration attribute. + :param file: representation of the file that is removed from the storage :param job_repr: Needed for debug output, will be replaced - :return: """ await (time + self.deletion_duration) await self._usedstorage.decrease(size=file.filesize) @@ -160,9 +175,9 @@ async def add(self, file: RequestedFile, job_repr=None): to the storage when they are also transferred through the Connections remote connection. If this simulator is extended to include any kind of direct file placement this has to be adapted. + :param file: representation of the file that is added to the storage :param job_repr: Needed for debug output, will be replaced - :return: """ file = file.convert_to_stored_file_object(time.now) @@ -173,6 +188,7 @@ async def add(self, file: RequestedFile, job_repr=None): async def _update(self, stored_file: StoredFile, job_repr): """ Updates a stored files information upon access. + :param stored_file: :param job_repr: Needed for debug output, will be replaced :return: @@ -186,9 +202,9 @@ async def transfer(self, file: RequestedFile, job_repr=None): Manages file transfer via the storage elements connection and updates file information. If the file should have been deleted since it was originally looked up the resulting error is not raised. + :param file: :param job_repr: Needed for debug output, will be replaced - :return: """ await self.connection.transfer(file.filesize) try: @@ -201,6 +217,7 @@ def find(self, requested_file: RequestedFile, job_repr=None): """ Searches storage object for the requested_file and sends result (amount of cached data, storage object) to the queue. 
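# Illustrative construction of a storage element with the parameters
# documented above; all numbers are made up, and sizes as well as the
# throughput limit are assumed to be given in bytes (per time unit).
storage = StorageElement(
    name="cache-1",
    sitename="site-1",
    size=100 * 1000 * 1000 * 1000,            # 100 GB of cache space
    throughput_limit=1 * 1000 * 1000 * 1000,  # bandwidth towards the drones
    files={},                                 # start with an empty cache
    deletion_duration=5,
    update_duration=1,
)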
+ :param requested_file: :param job_repr: Needed for debug output, will be replaced :return: (amount of cached data, storage object) @@ -246,6 +263,7 @@ def __init__( files=files, ) self._hitrate = hitrate + """global cache hitrate of this cache""" @property def available(self): @@ -261,9 +279,9 @@ async def transfer(self, file: RequestedFile, job_repr=None): of the file are found on and transferred from this storage. 1 - `_hitrate` percent of the file are transferred from the remote storage associated to the hitrate storage. + :param file: :param job_repr: - :return: """ async with Scope() as scope: logging.getLogger("implementation").warning( @@ -346,9 +364,10 @@ def find(self, requested_file: RequestedFile_HitrateBased, job_repr=None): """ Returns the expectation value for the amount of data of this file that are cached. + :param requested_file: :param job_repr: - :return: + :return: result of the lookup """ return LookUpInformation( requested_file.filesize * requested_file.cachehitrate, self From 52e54f2fb73eeb9f09944a81850ff10123839b6c Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 2 Oct 2020 17:40:32 +0200 Subject: [PATCH 583/648] Updated sphinx config to include autodoc for all major classes --- docs/conf.py | 10 +++++++--- docs/index.rst | 18 ++++++++++++++---- docs/source/topics/monitoring.rst | 2 -- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 60c0e41..63211be 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -44,7 +44,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ -# "sphinx_automodapi.automodapi", + "sphinx_automodapi.automodapi", "sphinx.ext.autosummary", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", @@ -56,9 +56,13 @@ ] -autodoc_default_flags = ['members'] +autodoc_default_options = { + 'members': True, + 'private-members':True +} + +autodoc_member_order = 'groupwise' autosummary_generate = True -autodoc_mock_imports = ["cobald", "usim", "sortedcontainers", "classad"] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/index.rst b/docs/index.rst index 996af2f..6c0a407 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -95,12 +95,22 @@ Indices and tables Class overview -------------- - .. autosummary:: - :toctree: stubs + :toctree: stubs + + lapis.connection + lapis.drone + lapis.files + lapis.storageelement + lapis.pool + lapis.monitoredpipe + lapis.simulator + lapis.scheduler + lapis.job + + + - lapis.files - lapis.job .. _HTCondor: https://research.cs.wisc.edu/htcondor/ diff --git a/docs/source/topics/monitoring.rst b/docs/source/topics/monitoring.rst index e48dba3..0906ca0 100644 --- a/docs/source/topics/monitoring.rst +++ b/docs/source/topics/monitoring.rst @@ -78,8 +78,6 @@ Caching-specific Monitoring ~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: lapis.monitor.caching.storage_status -.. autofunction:: lapis.monitor.caching.storage_connection -.. 
autofunction:: lapis.monitor.caching.remote_connection Telegraf -------- From edf0a44167b3941b6f397aacf3641db2ea39f422 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 2 Oct 2020 17:46:05 +0200 Subject: [PATCH 584/648] Moved monitoredpipe.py into lapis module, deleted deprecated version of monitoredpipe.py --- lapis/monitor/caching.py | 2 +- monitoredpipe.py => lapis/monitoredpipe.py | 4 ++ monitoredpipe_old.py | 63 ---------------------- 3 files changed, 5 insertions(+), 64 deletions(-) rename monitoredpipe.py => lapis/monitoredpipe.py (91%) delete mode 100644 monitoredpipe_old.py diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 352475c..baa65d6 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -11,7 +11,7 @@ SIMULATION_START, ) from lapis.storageelement import StorageElement -from monitoredpipe import MonitoredPipe, MonitoredPipeInfo +from lapis.monitoredpipe import MonitoredPipe, MonitoredPipeInfo import time as pytime from usim import time diff --git a/monitoredpipe.py b/lapis/monitoredpipe.py similarity index 91% rename from monitoredpipe.py rename to lapis/monitoredpipe.py index e663a94..add9a4d 100644 --- a/monitoredpipe.py +++ b/lapis/monitoredpipe.py @@ -14,12 +14,15 @@ class MonitoredPipeInfo(NamedTuple): class MonitoredPipe(Pipe): + """Implementation of the usim pipe object that can be monitored""" def __init__(self, throughput: float): super().__init__(throughput) self._monitor = Notification() self._monitor_buffers: Dict[Any, Deque[MonitoredPipeInfo]] = {} self.storage = None + """storage object the pipe simulates the network connection for, for monitoring purposes""" self.transferred_data = 0 + """total amount of data transferred by the pipe, for monitoring purposes""" async def load(self) -> AsyncIterable[MonitoredPipeInfo]: """ @@ -48,6 +51,7 @@ async def report_load(pipe: MonitoredPipe): del self._monitor_buffers[sentinel] def _throttle_subscribers(self): + """Scales down the available bandwidth for all users""" # print(time.now, "awakening monitors, throttling subscribers") self._monitor.__awake_all__() diff --git a/monitoredpipe_old.py b/monitoredpipe_old.py deleted file mode 100644 index 13bc5ef..0000000 --- a/monitoredpipe_old.py +++ /dev/null @@ -1,63 +0,0 @@ -from usim import Pipe, instant -from usim._primitives.notification import Notification -from typing import Optional - - -class MonitoredPipe(Pipe): - def __init__(self, throughput: float): - super().__init__(throughput) - self._monitor = Notification() - self.storage = None - self.transferred_data = 0 - - async def load(self): - """ - Monitor any changes of the throughput load of the pipe - .. code:: python3 - async def report_load(pipe: MonitoredPipe): - async for throughput in pipe.load(): - print(f'{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]') - .. note:: - Currently only works for loads exceeding 100%. 
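# Small usage sketch for the monitored pipe documented above; the coroutine is
# assumed to be started inside a usim simulation, e.g. via usim.run(demo()).
async def demo():
    pipe = MonitoredPipe(throughput=128)
    await pipe.transfer(total=512)   # completes after 4 time units at full rate
    print(pipe.transferred_data)     # -> 512, bookkeeping used by the monitoring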
- """ - await instant - yield sum(self._subscriptions.values()) - while True: - await self._monitor - yield sum(self._subscriptions.values()) - - def _throttle_subscribers(self): - self._monitor.__awake_all__() - super()._throttle_subscribers() - - async def transfer(self, total: float, throughput: Optional[float] = None) -> None: - await super().transfer(total, throughput) - self.transferred_data += total - - def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, self.storage or id(self)) - - -if __name__ == "__main__": - from usim import time, run, Scope - - async def report_load(pipe: MonitoredPipe): - async for throughput in pipe.load(): - print( - f"{time.now:6.0f}: {throughput} \t [{throughput / pipe.throughput * 100:03.0f}%]" - ) - - async def perform_load(pipe: MonitoredPipe, delay, amount): - await (time + delay) - await pipe.transfer(amount, pipe.throughput / 2) - - async def main(): - pipe = MonitoredPipe(128) - async with Scope() as scope: - scope.do(report_load(pipe), volatile=True) - scope.do(perform_load(pipe, 0, 512)) - scope.do(perform_load(pipe, 4, 1024)) - scope.do(perform_load(pipe, 6, 128)) - scope.do(perform_load(pipe, 12, 1024)) - - run(main()) From c105b8f663ec444898857e379c0718e66e1211f9 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 2 Oct 2020 20:36:39 +0200 Subject: [PATCH 585/648] Changed simulate.py to use FileBasedHitrateStorage instead of deprecated HitrateStorage. Issue to avoid hardcoding the storage type was created: https://github.com/tfesenbecker/lapis/issues/10 --- lapis/cli/simulate.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 619e55f..4d9f995 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -11,7 +11,7 @@ from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader -from lapis.storageelement import StorageElement, HitrateStorage +from lapis.storageelement import StorageElement, HitrateStorage, FileBasedHitrateStorage from lapis.storage_io.storage import storage_reader from lapis.scheduler import CondorJobScheduler @@ -112,9 +112,7 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit storage_input=storage_file, storage_content_input=storage_content_file, storage_reader=storage_import_mapper[storage_type], - storage_type=partial(HitrateStorage, cache_hitrate) - if cache_hitrate is not None - else StorageElement, + storage_type=FileBasedHitrateStorage, ) for current_pool in pool_file: pool_file, pool_file_type = current_pool From dd876d8b0f6519f105c56af8ef735a707df0ae95 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 2 Oct 2020 20:37:20 +0200 Subject: [PATCH 586/648] Extended autodocumentation --- docs/source/topics/autodoc.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 docs/source/topics/autodoc.rst diff --git a/docs/source/topics/autodoc.rst b/docs/source/topics/autodoc.rst new file mode 100644 index 0000000..90052dc --- /dev/null +++ b/docs/source/topics/autodoc.rst @@ -0,0 +1,20 @@ +Detailed documentation of all relevant modules +============================================== + +.. automodule:: lapis.connection + +.. automodule:: lapis.scheduler + +.. automodule:: lapis.job + +.. automodule:: lapis.storageelement + +.. automodule:: lapis.drone + +.. automodule:: lapis.pool + +.. automodule:: lapis.files + +.. automodule:: lapis.simulator + +.. 
automodule:: lapis.monitoredpipe \ No newline at end of file From 378c404f8846d02a3e7f365bee43fe9c88c2f84b Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Fri, 2 Oct 2020 20:37:20 +0200 Subject: [PATCH 587/648] Extended autodocumentation --- docs/source/topics/autodoc.rst | 20 ++++++++++++++++++++ lapis/cli/simulate.py | 1 + 2 files changed, 21 insertions(+) create mode 100644 docs/source/topics/autodoc.rst diff --git a/docs/source/topics/autodoc.rst b/docs/source/topics/autodoc.rst new file mode 100644 index 0000000..90052dc --- /dev/null +++ b/docs/source/topics/autodoc.rst @@ -0,0 +1,20 @@ +Detailed documentation of all relevant modules +============================================== + +.. automodule:: lapis.connection + +.. automodule:: lapis.scheduler + +.. automodule:: lapis.job + +.. automodule:: lapis.storageelement + +.. automodule:: lapis.drone + +.. automodule:: lapis.pool + +.. automodule:: lapis.files + +.. automodule:: lapis.simulator + +.. automodule:: lapis.monitoredpipe \ No newline at end of file diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 4d9f995..4adbde0 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -31,6 +31,7 @@ storage_import_mapper = {"standard": storage_reader} +"""Simulation CLI, pay attention to the fact that the random seed is currently set to a fixed value""" @click.group() @click.option("--seed", type=int, default=1234) From 5d14d59424c9ab0aad62efd9f24c6fa6864a7a0d Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 14 Oct 2020 14:56:08 +0200 Subject: [PATCH 588/648] Made flag for caching mode accessible in simulate.py --- lapis/cli/simulate.py | 3 ++- lapis/simulator.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 4adbde0..794b32f 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -107,7 +107,8 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit simulator.create_scheduler(scheduler_type=CondorJobScheduler) if all(storage_files): - simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024) + simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024, + False) storage_file, storage_content_file, storage_type = storage_files simulator.create_storage( storage_input=storage_file, diff --git a/lapis/simulator.py b/lapis/simulator.py index 7f0ff82..af18cc0 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -100,8 +100,8 @@ def create_storage( def create_scheduler(self, scheduler_type): self.job_scheduler = scheduler_type(job_queue=self.job_queue) - def create_connection_module(self, remote_throughput): - self.connection = Connection(remote_throughput) + def create_connection_module(self, remote_throughput, filebased_caching=True): + self.connection = Connection(remote_throughput, filebased_caching) def run(self, until=None): monitor.SIMULATION_START = pytime.time() From 3a27d722e74d4d704e1d498d15ef34cf85e37bb2 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 14 Oct 2020 15:25:22 +0200 Subject: [PATCH 589/648] introduced unit test of full hitrate based caching simulation --- lapis_tests/test_caching_hitrate_based.py | 138 ++++++++-------------- 1 file changed, 49 insertions(+), 89 deletions(-) diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 912bee8..76e3a51 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -5,14 +5,14 @@ from lapis_tests 
import via_usim, DummyDrone, DummyJob from lapis.connection import Connection -from lapis.storageelement import HitrateStorage +from lapis.storageelement import FileBasedHitrateStorage, HitrateStorage from lapis.storage_io.storage import storage_reader from lapis.files import RequestedFile from lapis.simulator import Simulator from lapis.job_io.htcondor import htcondor_job_reader from lapis.pool import StaticPool from lapis.pool_io.htcondor import htcondor_pool_reader -from lapis.scheduler import CondorJobScheduler +from lapis.scheduler import CondorJobScheduler, CondorClassadJobScheduler class TestHitrateCaching(object): @@ -94,121 +94,81 @@ async def test_simultaneous_transfer(self): assert stream_time == 15 @via_usim - async def test_caching_simulation_duration_short_jobs(self): - simulator = Simulator() - with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( - suffix=".csv" - ) as storage_config, NamedTemporaryFile(suffix=".json") as job_config: + async def test_full_simulation_with_hitratebased_caching(self): + with NamedTemporaryFile(suffix=".csv") as machine_config, \ + NamedTemporaryFile(suffix=".csv") as storage_config, \ + NamedTemporaryFile(suffix=".json") as job_config: with open(machine_config.name, "w") as write_stream: write_stream.write( - "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count sitename\n" - "1 44624348.0 8000 1 site1" + "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count sitename \n" + "1 44624348.0 4000 1 mysite" ) with open(job_config.name, "w") as write_stream: job_description = [ { - "QDate": 0, "RequestCpus": 1, "RequestWalltime": 60, - "RequestMemory": 1024, - "RequestDisk": 1024, - "RemoteWallClockTime": 1.0, - "MemoryUsage": 1024, - "DiskUsage_RAW": 1024, - "RemoteSysCpu": 1.0, - "RemoteUserCpu": 0.0, - "Inputfiles": dict( - file1=dict(usedsize=10), file2=dict(usedsize=5) - ), + "RequestMemory": 2000, + "RequestDisk": 6000000, + "QDate": 0, + "RemoteWallClockTime": 42, + "Number of Allocated Processors": 1, + "MemoryUsage": 1500, + "DiskUsage_RAW": 41898, + "RemoteSysCpu": 40, + "RemoteUserCpu": 2, + "Inputfiles": { + "a.root": { + "filesize": 5, + "usedsize": 5, + "hitrates": { + "mysite": 1.0 + } + }, + "b.root": { + "filesize": 5, + "usedsize": 5, + "hitrates": { + "mysite": 0.0 + } + } + } } - ] * 2 + ] json.dump(job_description, write_stream) with open(storage_config.name, "w") as write_stream: write_stream.write( - "name sitename cachesizeGB throughput_limit\n" - "cache1 site1 1000 1.0" + "name sitename cachesizeGB throughput_limit \n" + "mycache mysite 1000 1.0" ) job_input = open(job_config.name, "r+") machine_input = open(machine_config.name, "r+") storage_input = open(storage_config.name, "r+") storage_content_input = None - cache_hitrate = 0.5 + + simulator = Simulator() simulator.create_job_generator( - job_input=job_input, job_reader=htcondor_job_reader - ) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) - simulator.create_connection_module(remote_throughput=1.0) - simulator.create_pools( - pool_input=machine_input, - pool_reader=htcondor_pool_reader, - pool_type=StaticPool, + job_input=job_input, + job_reader=htcondor_job_reader ) + simulator.create_scheduler(scheduler_type=CondorClassadJobScheduler) + simulator.create_connection_module(remote_throughput=0.1 * 1000 * 1000 * + 1000, + filebased_caching=False) simulator.create_storage( storage_input=storage_input, storage_content_input=storage_content_input, storage_reader=storage_reader, - storage_type=partial(HitrateStorage, cache_hitrate), + 
storage_type=FileBasedHitrateStorage, ) - simulator.run() - assert 180 == simulator.duration - - @via_usim - async def test_caching_simulation_duration_long_jobs(self): - simulator = Simulator() - with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( - suffix=".csv" - ) as storage_config, NamedTemporaryFile(suffix=".json") as job_config: - with open(machine_config.name, "w") as write_stream: - write_stream.write( - "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count sitename\n" - "1 44624348.0 8000 1 site1" - ) - with open(job_config.name, "w") as write_stream: - job_description = [ - { - "QDate": 0, - "RequestCpus": 1, - "RequestWalltime": 60, - "RequestMemory": 1024, - "RequestDisk": 1024, - "RemoteWallClockTime": 1.0, - "MemoryUsage": 1024, - "DiskUsage_RAW": 1024, - "RemoteSysCpu": 1.0, - "RemoteUserCpu": 0.0, - "Inputfiles": dict( - file1=dict(usedsize=60), file2=dict(usedsize=60) - ), - } - ] * 2 - json.dump(job_description, write_stream) - with open(storage_config.name, "w") as write_stream: - write_stream.write( - "name sitename cachesizeGB throughput_limit\n" - "cache1 site1 1000 1.0" - ) - - job_input = open(job_config.name, "r+") - machine_input = open(machine_config.name, "r+") - storage_input = open(storage_config.name, "r+") - storage_content_input = None - cache_hitrate = 0.5 - simulator.create_job_generator( - job_input=job_input, job_reader=htcondor_job_reader - ) - simulator.create_scheduler(scheduler_type=CondorJobScheduler) - simulator.create_connection_module(remote_throughput=1.0) simulator.create_pools( pool_input=machine_input, pool_reader=htcondor_pool_reader, pool_type=StaticPool, ) - simulator.create_storage( - storage_input=storage_input, - storage_content_input=storage_content_input, - storage_reader=storage_reader, - storage_type=partial(HitrateStorage, cache_hitrate), - ) + + simulator.enable_monitoring() simulator.run() - assert 300 == simulator.duration + assert 180 == simulator.duration + From 068f5171e2d40afc0a63635708dd62587b2022e3 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 14 Oct 2020 16:29:05 +0200 Subject: [PATCH 590/648] RemoteStorage constructor now takes bandwidth as an argument instead of MonitoredPipe object Closes issue #14 --- lapis/connection.py | 4 ++-- lapis/storageelement.py | 7 +++--- lapis_tests/test_caching_hitrate_based.py | 23 +++++++++++++------ lapis_tests/test_job.py | 2 +- lapis_tests/test_remote_storage.py | 28 +++++++++++++++++++++++ 5 files changed, 51 insertions(+), 13 deletions(-) create mode 100644 lapis_tests/test_remote_storage.py diff --git a/lapis/connection.py b/lapis/connection.py index 4d7f9c1..8d26cbf 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -40,7 +40,7 @@ class Connection(object): "_filebased_caching", ) - def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): + def __init__(self, throughput, filebased_caching=True): """ Intialization of the connection object :param throughput: throughput of the connection's remote storage @@ -48,7 +48,7 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): """ self.storages = dict() """dictionary containing storage objects known to the connection module""" - self.remote_connection = RemoteStorage(MonitoredPipe(throughput=throughput)) + self.remote_connection = RemoteStorage(throughput=throughput) """pipe object representing the connection to a remote storage""" self.caching_algorithm = CacheAlgorithm( caching_strategy=lambda file, storage: check_size(file, storage) diff --git 
a/lapis/storageelement.py b/lapis/storageelement.py index f7d8916..0bd65e4 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -20,15 +20,16 @@ class RemoteStorage(Storage): connections. """ # TODO:: ensure that there can be multiple remote storages in the simulation - def __init__(self, pipe: MonitoredPipe): + def __init__(self, throughput: float): """ Initialization of the remote storages pipe, representing the network connection to remote storage with a limited bandwidth. :param pipe: """ - self.connection = pipe - pipe.storage = repr(self) + conversion_GB_to_B = 1000 * 1000 * 1000 + self.connection = MonitoredPipe(throughput=throughput * conversion_GB_to_B) + self.connection.storage = repr(self) @property def size(self): diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 76e3a51..df48551 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -14,6 +14,7 @@ from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.scheduler import CondorJobScheduler, CondorClassadJobScheduler +conversion_GB_to_B = 1000 * 1000 * 1000 class TestHitrateCaching(object): def test_hitratestorage(self): @@ -53,8 +54,11 @@ async def test_determine_inputfile_source(self): async def test_stream_file(self): throughput = 10 size = 1000 - requested_file = RequestedFile(filename="testfile", filesize=100) + requested_file = RequestedFile(filename="testfile", + filesize=100 * conversion_GB_to_B) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + # does not transfer from cache but from remote storage as there are no files + # in the HitrateStorage connection = Connection(throughput=throughput) connection.add_storage_element(hitratestorage) assert 0 == time.now @@ -67,8 +71,11 @@ async def test_single_transfer_files(self): size = 1000 drone = DummyDrone(throughput) job = DummyJob(True) - requested_files = dict(test=dict(usedsize=100, hitrates={drone.sitename: 1.0})) + requested_files = dict(test=dict(usedsize=100 * conversion_GB_to_B, + hitrates={drone.sitename: 1.0})) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + # does not transfer from cache but from remote storage as there are no files + # in the HitrateStorage drone.connection.add_storage_element(hitratestorage) stream_time = await drone.connection.transfer_files( drone=drone, requested_files=requested_files, job_repr=job @@ -83,10 +90,14 @@ async def test_simultaneous_transfer(self): size = 1000 drone = DummyDrone(throughput) job = DummyJob(True) - requested_files = dict(test1=dict(usedsize=100, hitrates={drone.sitename: 1.0}), - test2=dict(usedsize=200, hitrates={drone.sitename: 1.0})) + requested_files = dict(test1=dict(usedsize=100 * conversion_GB_to_B, + hitrates={drone.sitename: 1.0}), + test2=dict(usedsize=200 * conversion_GB_to_B, hitrates={ + drone.sitename: 1.0})) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) drone.connection.add_storage_element(hitratestorage) + # does not transfer from cache but from remote storage as there are no files + # in the HitrateStorage stream_time = await drone.connection.transfer_files( drone=drone, requested_files=requested_files, job_repr=job ) @@ -153,9 +164,7 @@ async def test_full_simulation_with_hitratebased_caching(self): job_reader=htcondor_job_reader ) simulator.create_scheduler(scheduler_type=CondorClassadJobScheduler) - simulator.create_connection_module(remote_throughput=0.1 * 1000 * 1000 * - 1000, - filebased_caching=False) + 
simulator.create_connection_module(remote_throughput=0.1, filebased_caching=False) simulator.create_storage( storage_input=storage_input, storage_content_input=storage_content_input, diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 29c8930..9c08001 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -49,7 +49,7 @@ async def test_job_in_drone(self): scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, - connection=Connection(), + connection=Connection(throughput=1), ) async with Scope() as scope: scope.do(drone.run(), volatile=True) diff --git a/lapis_tests/test_remote_storage.py b/lapis_tests/test_remote_storage.py new file mode 100644 index 0000000..5930c4a --- /dev/null +++ b/lapis_tests/test_remote_storage.py @@ -0,0 +1,28 @@ +from usim import time + +from lapis.storageelement import RemoteStorage +from lapis_tests import via_usim +from lapis.files import RequestedFile + + +class TestRemoteStorage(object): + + def test_throughput(self): + remote_storage = RemoteStorage(2.0) + assert remote_storage.connection.throughput == 2000000000 + + def test_size(self): + remote_storage = RemoteStorage(1.0) + assert remote_storage.size == float("Inf") + assert remote_storage.available == float("Inf") + assert remote_storage.used == 0 + + @via_usim + async def test_transfer(self): + remote_storage = RemoteStorage(1.0) + requested_file = RequestedFile("testfile", 10*1000*1000*1000) + await remote_storage.transfer(requested_file) + assert time.now == 10 + + + From 2e1cd83e2c064c9e58bab768a67afc74f3736690 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Wed, 14 Oct 2020 16:29:05 +0200 Subject: [PATCH 591/648] RemoteStorage constructor now takes bandwidth as an argument instead of MonitoredPipe object Closes issue #14 --- lapis/cli/simulate.py | 3 +-- lapis/connection.py | 4 ++-- lapis/storageelement.py | 7 +++--- lapis_tests/test_caching_hitrate_based.py | 23 +++++++++++++------ lapis_tests/test_job.py | 2 +- lapis_tests/test_remote_storage.py | 28 +++++++++++++++++++++++ 6 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 lapis_tests/test_remote_storage.py diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 794b32f..0819bfd 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -107,8 +107,7 @@ def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hit simulator.create_scheduler(scheduler_type=CondorJobScheduler) if all(storage_files): - simulator.create_connection_module(remote_throughput * 1024 * 1024 * 1024, - False) + simulator.create_connection_module(remote_throughput, False) storage_file, storage_content_file, storage_type = storage_files simulator.create_storage( storage_input=storage_file, diff --git a/lapis/connection.py b/lapis/connection.py index 4d7f9c1..8d26cbf 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -40,7 +40,7 @@ class Connection(object): "_filebased_caching", ) - def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): + def __init__(self, throughput, filebased_caching=True): """ Intialization of the connection object :param throughput: throughput of the connection's remote storage @@ -48,7 +48,7 @@ def __init__(self, throughput=1000 * 1000 * 1000, filebased_caching=True): """ self.storages = dict() """dictionary containing storage objects known to the connection module""" - self.remote_connection = RemoteStorage(MonitoredPipe(throughput=throughput)) + self.remote_connection = RemoteStorage(throughput=throughput) 
"""pipe object representing the connection to a remote storage""" self.caching_algorithm = CacheAlgorithm( caching_strategy=lambda file, storage: check_size(file, storage) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index f7d8916..0bd65e4 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -20,15 +20,16 @@ class RemoteStorage(Storage): connections. """ # TODO:: ensure that there can be multiple remote storages in the simulation - def __init__(self, pipe: MonitoredPipe): + def __init__(self, throughput: float): """ Initialization of the remote storages pipe, representing the network connection to remote storage with a limited bandwidth. :param pipe: """ - self.connection = pipe - pipe.storage = repr(self) + conversion_GB_to_B = 1000 * 1000 * 1000 + self.connection = MonitoredPipe(throughput=throughput * conversion_GB_to_B) + self.connection.storage = repr(self) @property def size(self): diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 76e3a51..df48551 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -14,6 +14,7 @@ from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.scheduler import CondorJobScheduler, CondorClassadJobScheduler +conversion_GB_to_B = 1000 * 1000 * 1000 class TestHitrateCaching(object): def test_hitratestorage(self): @@ -53,8 +54,11 @@ async def test_determine_inputfile_source(self): async def test_stream_file(self): throughput = 10 size = 1000 - requested_file = RequestedFile(filename="testfile", filesize=100) + requested_file = RequestedFile(filename="testfile", + filesize=100 * conversion_GB_to_B) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + # does not transfer from cache but from remote storage as there are no files + # in the HitrateStorage connection = Connection(throughput=throughput) connection.add_storage_element(hitratestorage) assert 0 == time.now @@ -67,8 +71,11 @@ async def test_single_transfer_files(self): size = 1000 drone = DummyDrone(throughput) job = DummyJob(True) - requested_files = dict(test=dict(usedsize=100, hitrates={drone.sitename: 1.0})) + requested_files = dict(test=dict(usedsize=100 * conversion_GB_to_B, + hitrates={drone.sitename: 1.0})) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + # does not transfer from cache but from remote storage as there are no files + # in the HitrateStorage drone.connection.add_storage_element(hitratestorage) stream_time = await drone.connection.transfer_files( drone=drone, requested_files=requested_files, job_repr=job @@ -83,10 +90,14 @@ async def test_simultaneous_transfer(self): size = 1000 drone = DummyDrone(throughput) job = DummyJob(True) - requested_files = dict(test1=dict(usedsize=100, hitrates={drone.sitename: 1.0}), - test2=dict(usedsize=200, hitrates={drone.sitename: 1.0})) + requested_files = dict(test1=dict(usedsize=100 * conversion_GB_to_B, + hitrates={drone.sitename: 1.0}), + test2=dict(usedsize=200 * conversion_GB_to_B, hitrates={ + drone.sitename: 1.0})) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) drone.connection.add_storage_element(hitratestorage) + # does not transfer from cache but from remote storage as there are no files + # in the HitrateStorage stream_time = await drone.connection.transfer_files( drone=drone, requested_files=requested_files, job_repr=job ) @@ -153,9 +164,7 @@ async def test_full_simulation_with_hitratebased_caching(self): job_reader=htcondor_job_reader 
) simulator.create_scheduler(scheduler_type=CondorClassadJobScheduler) - simulator.create_connection_module(remote_throughput=0.1 * 1000 * 1000 * - 1000, - filebased_caching=False) + simulator.create_connection_module(remote_throughput=0.1, filebased_caching=False) simulator.create_storage( storage_input=storage_input, storage_content_input=storage_content_input, diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 29c8930..9c08001 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -49,7 +49,7 @@ async def test_job_in_drone(self): scheduler=scheduler, pool_resources={"cores": 1, "memory": 1}, scheduling_duration=0, - connection=Connection(), + connection=Connection(throughput=1), ) async with Scope() as scope: scope.do(drone.run(), volatile=True) diff --git a/lapis_tests/test_remote_storage.py b/lapis_tests/test_remote_storage.py new file mode 100644 index 0000000..5930c4a --- /dev/null +++ b/lapis_tests/test_remote_storage.py @@ -0,0 +1,28 @@ +from usim import time + +from lapis.storageelement import RemoteStorage +from lapis_tests import via_usim +from lapis.files import RequestedFile + + +class TestRemoteStorage(object): + + def test_throughput(self): + remote_storage = RemoteStorage(2.0) + assert remote_storage.connection.throughput == 2000000000 + + def test_size(self): + remote_storage = RemoteStorage(1.0) + assert remote_storage.size == float("Inf") + assert remote_storage.available == float("Inf") + assert remote_storage.used == 0 + + @via_usim + async def test_transfer(self): + remote_storage = RemoteStorage(1.0) + requested_file = RequestedFile("testfile", 10*1000*1000*1000) + await remote_storage.transfer(requested_file) + assert time.now == 10 + + + From 64d83c2a620e810a49a53042ac703281e43e583e Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 15 Oct 2020 12:33:56 +0200 Subject: [PATCH 592/648] ensure that FileBasedHitrateStorage does not contain files --- lapis/storageelement.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 0bd65e4..2efdf0e 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -341,7 +341,7 @@ def __init__( sitename=sitename, size=size, throughput_limit=throughput_limit, - files=files, + files={}, ) @property From 8e88c077b824ad209e309004a4c85ec6d324234f Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 15 Oct 2020 12:34:39 +0200 Subject: [PATCH 593/648] Implemented tests for calculation time and transfer time in Job class --- lapis_tests/test_job_caching.py | 97 ++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 9 deletions(-) diff --git a/lapis_tests/test_job_caching.py b/lapis_tests/test_job_caching.py index d0d01b3..8caac2e 100644 --- a/lapis_tests/test_job_caching.py +++ b/lapis_tests/test_job_caching.py @@ -1,16 +1,95 @@ -import pytest -from usim import Scope, time +from usim import time -from lapis.drone import Drone from lapis.job import Job -from lapis_tests import via_usim, DummyScheduler, DummyDrone -from lapis.connection import Connection +from lapis_tests import via_usim, DummyDrone class TestJobCaching(object): - def test_calculation_time(self): - pass + @via_usim + async def test_calculation_time(self): + self.job = Job(resources={"walltime": 60}, + used_resources={"walltime": 10, "cores": 0.7}) + self.job.drone = DummyDrone(1) + starttime = time.now + await self.job._calculate() + assert time.now - starttime == 10 + + self.job = Job(resources={"walltime": 60, "inputfiles": {"file"}}, + 
used_resources={"walltime": 10, "cores": 0.7}) + self.job.drone = DummyDrone(1) + starttime = time.now + await self.job._calculate() + assert time.now - starttime == 7 + + self.job = Job(resources={"walltime": 60, "inputfiles": {"file"}}, + used_resources={"walltime": 10, "cores": 0.7}, + calculation_efficiency=0.5) + self.job.drone = DummyDrone(1) + starttime = time.now + await self.job._calculate() + assert time.now - starttime == 14 + + self.job = Job(resources={"walltime": 60, "inputfiles": {"file"}}, + used_resources={"walltime": 10}, + calculation_efficiency=0.5) + self.job.drone = DummyDrone(1) + starttime = time.now + await self.job._calculate() + assert time.now - starttime == 10 + + @via_usim + async def test_transfer_time(self): + conversion_GB_to_B = 1000 * 1000 * 1000 + drone = DummyDrone(1) + self.job = Job(resources={"walltime": 60, + "inputfiles": {"file": {"usedsize": 20 *conversion_GB_to_B}}}, + used_resources={"walltime": 10, + "inputfiles": { + "file": {"usedsize": 20 * conversion_GB_to_B, + "hitrates": {}}} + }, + calculation_efficiency=1.0) + + self.job.drone = drone + starttime = time.now + await self.job._transfer_inputfiles() + assert time.now - starttime == 20 + + self.job = Job(resources={"walltime": 60}, + used_resources={"walltime": 10}, + calculation_efficiency=1.0) + + self.job.drone = drone + starttime = time.now + await self.job._transfer_inputfiles() + assert time.now - starttime == 0 + + self.job = Job(resources={"walltime": 60, + "inputfiles": { + "file": {"usedsize": 20 *conversion_GB_to_B}}}, + used_resources={"walltime": 10}, + calculation_efficiency=1.0) + + self.job.drone = drone + starttime = time.now + await self.job._transfer_inputfiles() + assert time.now - starttime == 0 + + self.job = Job(resources={"walltime": 60, + "inputfiles": { + "file": {"usedsize": 20 * conversion_GB_to_B}}}, + used_resources={"walltime": 10, + "inputfiles": { + "file": {"usedsize": 20 * + conversion_GB_to_B, + "hitrates": {}}, + } + }, + calculation_efficiency=1.0) + + self.job.drone = drone + starttime = time.now + await self.job._transfer_inputfiles() + assert time.now - starttime == 20 - def test_transfer_time(self): - pass From c9be8f07f3906c1ddf5075e9fef18443206a3ac7 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Thu, 15 Oct 2020 12:36:34 +0200 Subject: [PATCH 594/648] Added test for FileBasedHitrateStorage class --- lapis_tests/test_storage_filebasedhitrate.py | 53 ++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 lapis_tests/test_storage_filebasedhitrate.py diff --git a/lapis_tests/test_storage_filebasedhitrate.py b/lapis_tests/test_storage_filebasedhitrate.py new file mode 100644 index 0000000..2861811 --- /dev/null +++ b/lapis_tests/test_storage_filebasedhitrate.py @@ -0,0 +1,53 @@ +from lapis.storageelement import FileBasedHitrateStorage +from lapis_tests import via_usim, DummyJob +from lapis.files import RequestedFile_HitrateBased +from lapis.storageelement import LookUpInformation + +from usim import time + +import pytest + +class TestFileBasedHitrateStorag(): + + def test_storage_initialization(self): + filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", + size=200, throughput_limit=1) + assert filebasedhitratestorage.files == {} + assert filebasedhitratestorage.name == "name" + assert filebasedhitratestorage.sitename == "site" + assert filebasedhitratestorage.size == 200 + assert filebasedhitratestorage.connection.throughput == 1 + + assert filebasedhitratestorage.available == 200 + assert 
filebasedhitratestorage.used == 0 + + @via_usim + async def test_transfer(self): + filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", + size=200, throughput_limit=1) + requestedFile = RequestedFile_HitrateBased("filename", 20, 1) + await filebasedhitratestorage.transfer(requestedFile, DummyJob()) + assert time.now == 20 + + with pytest.raises(ValueError): + requestedFile = RequestedFile_HitrateBased("filename", 20, 0) + await filebasedhitratestorage.transfer(requestedFile, DummyJob()) + + def test_find_file_in_storage(self): + filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", + size=200, throughput_limit=1) + requestedFile = RequestedFile_HitrateBased("filename", 20, 1) + foundFile = LookUpInformation(20, filebasedhitratestorage) + + assert filebasedhitratestorage.find(requestedFile) == foundFile + + def test_modification_of_stored_files(self): + filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", + size=200, throughput_limit=1) + requestedFile = RequestedFile_HitrateBased("filename", 20, 1) + + filebasedhitratestorage.add(requestedFile) + assert filebasedhitratestorage.files == {} + + filebasedhitratestorage.remove(requestedFile) + assert filebasedhitratestorage.files == {} From 0a79525ad0c3418893232fb084d9437b625696a5 Mon Sep 17 00:00:00 2001 From: tfesenbecker Date: Mon, 19 Oct 2020 08:11:56 +0200 Subject: [PATCH 595/648] Fixed typos in string documentation --- lapis/cli/simulate.py | 4 +++- lapis/connection.py | 15 +++++++-------- lapis/files.py | 2 +- lapis/interfaces/_storage.py | 2 +- lapis/job.py | 16 +++++++--------- lapis/monitoredpipe.py | 3 ++- lapis/scheduler.py | 4 ++-- lapis/storageelement.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 0819bfd..6c2e0cb 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -31,7 +31,9 @@ storage_import_mapper = {"standard": storage_reader} -"""Simulation CLI, pay attention to the fact that the random seed is currently set to a fixed value""" +"""Simulation CLI, pay attention to the fact that the random seed is currently set to a +fixed value""" + @click.group() @click.option("--seed", type=int, default=1234) diff --git a/lapis/connection.py b/lapis/connection.py index 8d26cbf..1ad2605 100644 --- a/lapis/connection.py +++ b/lapis/connection.py @@ -42,7 +42,7 @@ class Connection(object): def __init__(self, throughput, filebased_caching=True): """ - Intialization of the connection object + Initialization of the connection object :param throughput: throughput of the connection's remote storage :param filebased_caching: """ @@ -57,10 +57,10 @@ def __init__(self, throughput, filebased_caching=True): file, storage ), ) - """cache behavior filebased caching, contains both caching and deletion + """cache behavior file based caching, contains both caching and deletion strategy""" self._filebased_caching = filebased_caching - """flag, true if filebased caching is current caching mode""" + """flag, true if file based caching is current caching mode""" async def run_pipemonitoring(self): """ @@ -79,7 +79,7 @@ async def report_load_to_monitoring(pipe: MonitoredPipe): def add_storage_element(self, storage_element: StorageElement): """ - Register storage element in Connetion module, clustering storage elements by + Register storage element in Connection module, clustering storage elements by sitename :param storage_element: @@ -108,7 +108,7 @@ async def _determine_inputfile_source( 
:param requested_file: :param dronesite: :param job_repr: - :return: + :return: pipe that will be used for file transfer """ provided_storages = self.storages.get(dronesite, None) if provided_storages is not None: @@ -129,14 +129,13 @@ async def stream_file( ): """ Determines which storage object is used to provide the requested file and - startes the files transfer. For files transfered via remote connection a + starts the files transfer. For files transferred via remote connection a potential cache decides whether to cache the file and handles the caching process. :param requested_file: :param dronesite: :param job_repr: - :return: """ used_connection = await self._determine_inputfile_source( requested_file, dronesite, job_repr @@ -172,7 +171,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): :param drone: :param requested_files: :param job_repr: - :return: + :return: time that passed while file was transferred """ start_time = time.now diff --git a/lapis/files.py b/lapis/files.py index 59a7f19..e920298 100644 --- a/lapis/files.py +++ b/lapis/files.py @@ -27,7 +27,7 @@ def __init__( **filespecs, ): """ - Intialization of a stored file + Initialization of a stored file :param filename: name of the file :param filesize: size of the file diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index 7ff3905..0735b3f 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -55,7 +55,7 @@ async def add(self, file: RequestedFile, job_repr): @abc.abstractmethod async def remove(self, file: StoredFile, job_repr): """ - Remove all file information and used filesize from the storage. + Remove all file information and used file size from the storage. """ raise NotImplementedError diff --git a/lapis/job.py b/lapis/job.py index 8d09628..779c9d7 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -28,7 +28,7 @@ class Job(object): input data. In this case the job's runtime is recalculated if the job processes input data and is executed on resources with access to caches. In this case data transfer and processing are assumed to be done in parallel. This is a valid - assumption if the input data are divided into blocks, transfered + assumption if the input data are divided into blocks, transferred throughout the job's runtime and if already transferred data blocks are processed while other blocks are fetched. 
If the job's overall runtime is long and if the data set was transferred in a large number of blocks, the job's runtime (walltime) @@ -111,7 +111,7 @@ def __init__( :param name: name of the job :param drone: drone the job is running on :param calculation_efficiency: efficiency of the job's calculations, - can be < 1.0 to account for programmatical insufficiencie + can be < 1.0 to account for programmatical insufficiencies """ self.resources = resources """dict containing resources requested by the job""" @@ -167,7 +167,7 @@ def __init__( # sense in all use cases try: self.cputime = self.used_resources["cores"] * self.walltime - """walltime of the job if the CPU efficienca was always optimal""" + """walltime of the job if the CPU efficiency = 1.0""" except KeyError: self.cputime = None @@ -175,11 +175,11 @@ def __init__( self._total_input_data = sum( [fileinfo["usedsize"] for fileinfo in self.used_inputfiles.values()] ) - """total data volume of the job's input diles""" + """total data volume of the job's input files""" except AttributeError: self._total_input_data = 0 - # TODO: see unit test test_read_with_inputfiles -> decide whether making + # TODO: see unit test test_read_with_inputfiles -> making # information about hitrates obilgatory is actually necessary if self._total_input_data: self.expectation_cached_data = sum( @@ -234,11 +234,9 @@ async def _calculate(self): If a job contains input files and the drone the job runs on has a defined remote connection (throughput < Inf) the calculation time is given by job's CPU time divided by a configurable `calculation_efficiency` that can be set != 1, e.g. to - account for programmatical inefficiencies. + account for programmatic inefficiencies. - Else, the calculation time remains equal to the job's orginal`walltime`. - - :param calculation_efficiency: + Else, the calculation time remains equal to the job's original `walltime`. 
""" result = self.walltime try: diff --git a/lapis/monitoredpipe.py b/lapis/monitoredpipe.py index add9a4d..640aee6 100644 --- a/lapis/monitoredpipe.py +++ b/lapis/monitoredpipe.py @@ -20,7 +20,8 @@ def __init__(self, throughput: float): self._monitor = Notification() self._monitor_buffers: Dict[Any, Deque[MonitoredPipeInfo]] = {} self.storage = None - """storage object the pipe simulates the network connection for, for monitoring purposes""" + """storage object the pipe simulates the network connection for, for monitoring + purposes""" self.transferred_data = 0 """total amount of data transferred by the pipe, for monitoring purposes""" diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 1d79fe5..ea7547f 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -87,7 +87,7 @@ def access_wrapped(name, requested=True): Extracts the wrapped object's current quantity of a certain resource ( cores, memory, disk) - :param name: name of the reosurce that is to be accessed + :param name: name of the resource that is to be accessed :param requested: false if name is a resource of the drone, true if name is a resource requested by a job :return: value of respective resource @@ -455,7 +455,7 @@ class RankedAutoClusters(RankedClusters[DJ]): def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression): """ - :param quantization: factors to convert resources into HTCondor scalings + :param quantization: factors to convert resources into HTCondor scaling :param ranking: prejobrank expression """ self._quantization = quantization diff --git a/lapis/storageelement.py b/lapis/storageelement.py index 2efdf0e..b7d94fe 100644 --- a/lapis/storageelement.py +++ b/lapis/storageelement.py @@ -69,7 +69,7 @@ async def remove(self, file: StoredFile, **kwargs): def find(self, file: RequestedFile, **kwargs) -> LookUpInformation: """ All files are contained in remote storage. Therefore no functionality - to determine whether the storage cotains a certain file is provided. + to determine whether the storage contains a certain file is provided. 
""" raise NotImplementedError From 03d4fdcc7cfeb4c3badc994cc92f5f81f2f8c14c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 9 Dec 2020 19:11:48 +0100 Subject: [PATCH 596/648] added caching-based files to caching folder --- custom_simulate.py | 4 ++-- lapis/{ => caching}/cachealgorithm.py | 4 ++-- lapis/{ => caching}/connection.py | 8 ++++---- lapis/{ => caching}/files.py | 0 lapis/{ => caching}/monitoredpipe.py | 0 lapis/{ => caching}/storageelement.py | 4 ++-- lapis/cli/simulate.py | 2 +- lapis/drone.py | 2 +- lapis/interfaces/_storage.py | 2 +- lapis/monitor/caching.py | 5 ++--- lapis/pool.py | 2 +- lapis/pool_io/htcondor.py | 2 +- lapis/simulator.py | 2 +- lapis/storage_io/storage.py | 2 +- lapis/utilities/cache_cleanup_implementations.py | 2 +- lapis_tests/__init__.py | 2 +- lapis_tests/storage_io/test_storage.py | 2 +- lapis_tests/test_caching_hitrate_based.py | 9 ++++----- lapis_tests/test_job.py | 2 +- lapis_tests/test_remote_storage.py | 4 ++-- lapis_tests/test_storage_filebasedhitrate.py | 6 +++--- 21 files changed, 32 insertions(+), 34 deletions(-) rename lapis/{ => caching}/cachealgorithm.py (94%) rename lapis/{ => caching}/connection.py (97%) rename lapis/{ => caching}/files.py (100%) rename lapis/{ => caching}/monitoredpipe.py (100%) rename lapis/{ => caching}/storageelement.py (98%) diff --git a/custom_simulate.py b/custom_simulate.py index 6820b6a..2fd6a98 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -5,14 +5,14 @@ from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter -from lapis.connection import Connection +from lapis.caching.connection import Connection from lapis.drone import Drone from lapis.job_io.htcondor import htcondor_job_reader from lapis.pool import StaticPool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader -from lapis.storageelement import FileBasedHitrateStorage +from lapis.caching.storageelement import FileBasedHitrateStorage from lapis.storage_io.storage import ( storage_reader, storage_reader_filebased_hitrate_caching, diff --git a/lapis/cachealgorithm.py b/lapis/caching/cachealgorithm.py similarity index 94% rename from lapis/cachealgorithm.py rename to lapis/caching/cachealgorithm.py index caa0d66..20febe4 100644 --- a/lapis/cachealgorithm.py +++ b/lapis/caching/cachealgorithm.py @@ -1,7 +1,7 @@ from typing import Optional, Callable, Tuple -from lapis.files import RequestedFile, StoredFile -from lapis.storageelement import StorageElement +from lapis.caching.files import RequestedFile, StoredFile +from lapis.caching.storageelement import StorageElement from lapis.utilities.cache_cleanup_implementations import sort_files_by_cachedsince diff --git a/lapis/connection.py b/lapis/caching/connection.py similarity index 97% rename from lapis/connection.py rename to lapis/caching/connection.py index 1ad2605..49a6212 100644 --- a/lapis/connection.py +++ b/lapis/caching/connection.py @@ -2,17 +2,17 @@ from typing import Union, Optional from usim import Scope, time -from lapis.monitoredpipe import MonitoredPipe +from lapis.caching.monitoredpipe import MonitoredPipe -from lapis.cachealgorithm import ( +from lapis.caching.cachealgorithm import ( CacheAlgorithm, check_size, check_relevance, delete_oldest_few_used, ) -from lapis.storageelement import StorageElement, RemoteStorage -from lapis.files import RequestedFile, RequestedFile_HitrateBased +from lapis.caching.storageelement import StorageElement, RemoteStorage +from 
lapis.caching.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor import sampling_required from lapis.monitor.caching import HitrateInfo diff --git a/lapis/files.py b/lapis/caching/files.py similarity index 100% rename from lapis/files.py rename to lapis/caching/files.py diff --git a/lapis/monitoredpipe.py b/lapis/caching/monitoredpipe.py similarity index 100% rename from lapis/monitoredpipe.py rename to lapis/caching/monitoredpipe.py diff --git a/lapis/storageelement.py b/lapis/caching/storageelement.py similarity index 98% rename from lapis/storageelement.py rename to lapis/caching/storageelement.py index b7d94fe..58a1fbc 100644 --- a/lapis/storageelement.py +++ b/lapis/caching/storageelement.py @@ -1,10 +1,10 @@ from typing import Optional from usim import time, Resources, Scope -from lapis.monitoredpipe import MonitoredPipe +from lapis.caching.monitoredpipe import MonitoredPipe from lapis.monitor import sampling_required -from lapis.files import StoredFile, RequestedFile, RequestedFile_HitrateBased +from lapis.caching.files import StoredFile, RequestedFile, RequestedFile_HitrateBased from lapis.interfaces._storage import Storage, LookUpInformation import logging diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 6c2e0cb..3f03efe 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -11,7 +11,7 @@ from lapis.pool import StaticPool, Pool from lapis.pool_io.htcondor import htcondor_pool_reader from lapis.job_io.swf import swf_job_reader -from lapis.storageelement import StorageElement, HitrateStorage, FileBasedHitrateStorage +from lapis.caching.storageelement import FileBasedHitrateStorage from lapis.storage_io.storage import storage_reader from lapis.scheduler import CondorJobScheduler diff --git a/lapis/drone.py b/lapis/drone.py index f5fc159..9c0aeab 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -4,7 +4,7 @@ from typing import Optional from lapis.job import Job -from lapis.connection import Connection +from lapis.caching.connection import Connection from lapis.monitor.duplicates import DroneStatusCaching diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index 0735b3f..ea6c8c6 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -2,7 +2,7 @@ from typing import NamedTuple -from lapis.files import RequestedFile, StoredFile +from lapis.caching.files import RequestedFile, StoredFile class LookUpInformation(NamedTuple): diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index baa65d6..ce807ea 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -10,11 +10,10 @@ LoggingUDPSocketHandler, SIMULATION_START, ) -from lapis.storageelement import StorageElement -from lapis.monitoredpipe import MonitoredPipe, MonitoredPipeInfo +from lapis.caching.storageelement import StorageElement +from lapis.caching.monitoredpipe import MonitoredPipe, MonitoredPipeInfo import time as pytime -from usim import time class HitrateInfo(NamedTuple): diff --git a/lapis/pool.py b/lapis/pool.py index 74bbc9b..ce95191 100644 --- a/lapis/pool.py +++ b/lapis/pool.py @@ -3,7 +3,7 @@ from cobald import interfaces from usim import eternity, Scope, interval -from lapis.connection import Connection +from lapis.caching.connection import Connection from .drone import Drone diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index d60e92e..6fb3222 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -3,7 +3,7 @@ from typing import Callable -from lapis.connection 
import Connection +from lapis.caching.connection import Connection from ..pool import Pool diff --git a/lapis/simulator.py b/lapis/simulator.py index af18cc0..8037dd3 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -11,7 +11,7 @@ import lapis.monitor as monitor from lapis.drone import Drone from lapis.job import job_to_queue_scheduler -from lapis.connection import Connection +from lapis.caching.connection import Connection from lapis.monitor.caching import ( storage_status, pipe_status, diff --git a/lapis/storage_io/storage.py b/lapis/storage_io/storage.py index 1d1fd48..7d6906d 100644 --- a/lapis/storage_io/storage.py +++ b/lapis/storage_io/storage.py @@ -1,7 +1,7 @@ import csv from functools import partial -from lapis.files import StoredFile +from lapis.caching.files import StoredFile def storage_reader( diff --git a/lapis/utilities/cache_cleanup_implementations.py b/lapis/utilities/cache_cleanup_implementations.py index 0ddf493..9892f6a 100644 --- a/lapis/utilities/cache_cleanup_implementations.py +++ b/lapis/utilities/cache_cleanup_implementations.py @@ -1,6 +1,6 @@ from typing import List -from lapis.files import StoredFile +from lapis.caching.files import StoredFile def sort_files_by_cachedsince(stored_files: set) -> List[StoredFile]: diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 7a4b971..332f6ec 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -5,7 +5,7 @@ from lapis.drone import Drone from lapis.job import Job -from lapis.connection import Connection +from lapis.caching.connection import Connection class UnfinishedTest(RuntimeError): diff --git a/lapis_tests/storage_io/test_storage.py b/lapis_tests/storage_io/test_storage.py index 559014f..c1bd0e8 100644 --- a/lapis_tests/storage_io/test_storage.py +++ b/lapis_tests/storage_io/test_storage.py @@ -1,6 +1,6 @@ from tempfile import NamedTemporaryFile -from lapis.storageelement import StorageElement +from lapis.caching.storageelement import StorageElement from lapis.storage_io.storage import storage_reader diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index df48551..5a2e0dd 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -1,18 +1,17 @@ from usim import time from tempfile import NamedTemporaryFile import json -from functools import partial from lapis_tests import via_usim, DummyDrone, DummyJob -from lapis.connection import Connection -from lapis.storageelement import FileBasedHitrateStorage, HitrateStorage +from lapis.caching.connection import Connection +from lapis.caching.storageelement import FileBasedHitrateStorage, HitrateStorage from lapis.storage_io.storage import storage_reader -from lapis.files import RequestedFile +from lapis.caching.files import RequestedFile from lapis.simulator import Simulator from lapis.job_io.htcondor import htcondor_job_reader from lapis.pool import StaticPool from lapis.pool_io.htcondor import htcondor_pool_reader -from lapis.scheduler import CondorJobScheduler, CondorClassadJobScheduler +from lapis.scheduler import CondorClassadJobScheduler conversion_GB_to_B = 1000 * 1000 * 1000 diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index 9c08001..dd985ce 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -5,7 +5,7 @@ from lapis.job import Job from lapis_tests import via_usim, DummyScheduler, DummyDrone -from lapis.connection import Connection +from lapis.caching.connection import Connection class 
TestJob(object): diff --git a/lapis_tests/test_remote_storage.py b/lapis_tests/test_remote_storage.py index 5930c4a..ae00b88 100644 --- a/lapis_tests/test_remote_storage.py +++ b/lapis_tests/test_remote_storage.py @@ -1,8 +1,8 @@ from usim import time -from lapis.storageelement import RemoteStorage +from lapis.caching.storageelement import RemoteStorage from lapis_tests import via_usim -from lapis.files import RequestedFile +from lapis.caching.files import RequestedFile class TestRemoteStorage(object): diff --git a/lapis_tests/test_storage_filebasedhitrate.py b/lapis_tests/test_storage_filebasedhitrate.py index 2861811..4c5c8ab 100644 --- a/lapis_tests/test_storage_filebasedhitrate.py +++ b/lapis_tests/test_storage_filebasedhitrate.py @@ -1,7 +1,7 @@ -from lapis.storageelement import FileBasedHitrateStorage +from lapis.caching.storageelement import FileBasedHitrateStorage from lapis_tests import via_usim, DummyJob -from lapis.files import RequestedFile_HitrateBased -from lapis.storageelement import LookUpInformation +from lapis.caching.files import RequestedFile_HitrateBased +from lapis.caching.storageelement import LookUpInformation from usim import time From a25d4e9f7726d1847d4fcd5443fc9edd7fd501fd Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 9 Dec 2020 19:59:06 +0100 Subject: [PATCH 597/648] blackened files --- .coveragerc | 2 +- .gitignore | 1 - custom_simulate.py | 2 - docs/Makefile | 2 +- docs/conf.py | 7 +- docs/source/changelog.rst | 1 - docs/source/topics/autodoc.rst | 2 +- lapis/caching/connection.py | 5 +- lapis/caching/files.py | 2 + lapis/caching/monitoredpipe.py | 3 +- lapis/caching/storageelement.py | 9 +- lapis/cli/simulate.py | 2 +- lapis/drone.py | 13 +-- lapis/interfaces/_storage.py | 1 + lapis/job.py | 6 +- lapis/scheduler.py | 4 + lapis/scheduler_withoutClassAds.py | 12 --- lapis_tests/__init__.py | 1 - lapis_tests/data/htcondor_pools.csv | 2 +- lapis_tests/test_caching_hitrate_based.py | 59 ++++++------ lapis_tests/test_job_caching.py | 94 ++++++++++++-------- lapis_tests/test_remote_storage.py | 6 +- lapis_tests/test_storage_filebasedhitrate.py | 22 +++-- 23 files changed, 136 insertions(+), 122 deletions(-) delete mode 100644 lapis/scheduler_withoutClassAds.py diff --git a/.coveragerc b/.coveragerc index 217073f..836ee80 100644 --- a/.coveragerc +++ b/.coveragerc @@ -18,4 +18,4 @@ exclude_lines = raise NotImplementedError return NotImplemented if __name__ == "__main__" - if __name__ == '__main__' \ No newline at end of file + if __name__ == '__main__' diff --git a/.gitignore b/.gitignore index 6d07713..7bc8879 100644 --- a/.gitignore +++ b/.gitignore @@ -207,4 +207,3 @@ Icon Network Trash Folder Temporary Items .apdisk - diff --git a/custom_simulate.py b/custom_simulate.py index 2fd6a98..a3e19d0 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -161,5 +161,3 @@ def ini_and_run( # run simulation simulator.run(until=until) - - diff --git a/docs/Makefile b/docs/Makefile index 298ea9e..5128596 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,4 +16,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py index 63211be..53089a9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,12 +56,9 @@ ] -autodoc_default_options = { - 'members': True, - 'private-members':True -} +autodoc_default_options = {"members": True, "private-members": True} -autodoc_member_order = 'groupwise' +autodoc_member_order = "groupwise" autosummary_generate = True diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 8ea72ab..048ffab 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -65,4 +65,3 @@ Version [0.1.1] - 2019-10-24 * **[Fixed]** Calculation of used and requested resource ratio * **[Fixed]** StopIteration handling by Job Generator * **[Fixed]** Importing of SWF files - diff --git a/docs/source/topics/autodoc.rst b/docs/source/topics/autodoc.rst index 90052dc..82ea68a 100644 --- a/docs/source/topics/autodoc.rst +++ b/docs/source/topics/autodoc.rst @@ -17,4 +17,4 @@ Detailed documentation of all relevant modules .. automodule:: lapis.simulator -.. automodule:: lapis.monitoredpipe \ No newline at end of file +.. automodule:: lapis.monitoredpipe diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 49a6212..f16e08a 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -57,7 +57,7 @@ def __init__(self, throughput, filebased_caching=True): file, storage ), ) - """cache behavior file based caching, contains both caching and deletion + """cache behavior file based caching, contains both caching and deletion strategy""" self._filebased_caching = filebased_caching """flag, true if file based caching is current caching mode""" @@ -67,6 +67,7 @@ async def run_pipemonitoring(self): Starts monitoring of pipe objects, should be called during simulator/monitoring initialization. """ + async def report_load_to_monitoring(pipe: MonitoredPipe): async for information in pipe.load(): await sampling_required.put(information) @@ -191,7 +192,7 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): except ZeroDivisionError: hitrate = 0 provides_file = 0 - #TODO:: In which cases is hitrate not defined and how can they be covered? I + # TODO:: In which cases is hitrate not defined and how can they be covered? I # think that in this case this code should not be reached but I'm unsure # right now diff --git a/lapis/caching/files.py b/lapis/caching/files.py index e920298..bad5f45 100644 --- a/lapis/caching/files.py +++ b/lapis/caching/files.py @@ -61,6 +61,7 @@ class RequestedFile(NamedTuple): """ Representation of a requested file """ + filename: str """name of the file""" filesize: Optional[int] = None @@ -88,6 +89,7 @@ class RequestedFile_HitrateBased(NamedTuple): The cachehitrate flag is somewhat messed up currently. 
**Its use should be reworked when remodeling the connection module.** """ + filename: str """name of the requested file""" filesize: int diff --git a/lapis/caching/monitoredpipe.py b/lapis/caching/monitoredpipe.py index 640aee6..f5827df 100644 --- a/lapis/caching/monitoredpipe.py +++ b/lapis/caching/monitoredpipe.py @@ -15,12 +15,13 @@ class MonitoredPipeInfo(NamedTuple): class MonitoredPipe(Pipe): """Implementation of the usim pipe object that can be monitored""" + def __init__(self, throughput: float): super().__init__(throughput) self._monitor = Notification() self._monitor_buffers: Dict[Any, Deque[MonitoredPipeInfo]] = {} self.storage = None - """storage object the pipe simulates the network connection for, for monitoring + """storage object the pipe simulates the network connection for, for monitoring purposes""" self.transferred_data = 0 """total amount of data transferred by the pipe, for monitoring purposes""" diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index 58a1fbc..b957950 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -19,6 +19,7 @@ class RemoteStorage(Storage): storages in the simulation because resource pools may have differing network connections. """ + # TODO:: ensure that there can be multiple remote storages in the simulation def __init__(self, throughput: float): """ @@ -102,7 +103,7 @@ def __init__( throughput_limit: int = 10 * 1000 * 1000 * 1000, files: Optional[dict] = None, deletion_duration: float = 5, - update_duration: float = 1 + update_duration: float = 1, ): """ Intialization of a storage element object. @@ -122,7 +123,7 @@ def __init__( self.name = name """identification of the storage""" self.sitename = sitename - """identifier, drones with the same sitename can access this + """identifier, drones with the same sitename can access this storage""" self.deletion_duration = deletion_duration """amount of time passing while a file is deleted from the storage""" @@ -138,7 +139,7 @@ def __init__( """amount of storage space that is currently in use""" self.connection = MonitoredPipe(throughput_limit) """Pipe representing the network connection to this storage - **Namespace problem between connection module and this pipe called + **Namespace problem between connection module and this pipe called connection**""" self.connection.storage = repr(self) @@ -247,6 +248,7 @@ class HitrateStorage(StorageElement): 1 - `_hitrate` percent of the file are transferred from the remote storage associated to the hitrate storage. """ + def __init__( self, hitrate, @@ -328,6 +330,7 @@ class FileBasedHitrateStorage(StorageElement): #TODO: this storage object has become very intermingled with the connection module and should be tidied up and restructured! """ + def __init__( self, name: Optional[str] = None, diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 3f03efe..3f9d216 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -31,7 +31,7 @@ storage_import_mapper = {"standard": storage_reader} -"""Simulation CLI, pay attention to the fact that the random seed is currently set to a +"""Simulation CLI, pay attention to the fact that the random seed is currently set to a fixed value""" diff --git a/lapis/drone.py b/lapis/drone.py index 9c0aeab..5da3d98 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -17,6 +17,7 @@ class Drone(interfaces.Pool): """ Represents worker nodes in the simulation. 
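The storage-related constructors touched in these hunks (StorageElement and its FileBasedHitrateStorage subclass, RequestedFile_HitrateBased) are exercised by the tests later in this patch series. As orientation only, here is a usage sketch consistent with those signatures; the concrete values are invented and the quantities are given in the simulation's raw units.

from lapis.caching.files import RequestedFile_HitrateBased
from lapis.caching.storageelement import FileBasedHitrateStorage

# A cache bound to site "mysite" (illustrative values only).
cache = FileBasedHitrateStorage(
    name="cache_a", sitename="mysite", size=200, throughput_limit=1
)
# A requested file of size 20 that is fully served by a cache (hitrate flag 1).
requested = RequestedFile_HitrateBased("a.root", 20, 1)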
""" + def __init__( self, scheduler, @@ -48,16 +49,16 @@ def __init__( self.connection = connection """connection object that holds remote connection and handles file transfers""" self.sitename = sitename - """identifies the site the drone belongs to, used to determine which caches a + """identifies the site the drone belongs to, used to determine which caches a drone can use """ self.pool_resources = pool_resources """dict stating the drone's resources""" self.resources = Capacities(**pool_resources) - """available resources, based on the amount of resources requested by + """available resources, based on the amount of resources requested by jobs running on the drone """ # shadowing requested resources to determine jobs to be killed self.used_resources = Capacities(**pool_resources) - """available resources, based on the amount of resources actually used by + """available resources, based on the amount of resources actually used by jobs running on the drone""" if ignore_resources: @@ -82,11 +83,11 @@ def __init__( # caching-related self.jobs_with_cached_data = 0 - """amount of jobs that currently run on the drone and that could read from + """amount of jobs that currently run on the drone and that could read from the cache""" self.cached_data = 0 - """used during scheduling, calculated for each job, is assigned the - expectation value for the amount of cached data that is available to the + """used during scheduling, calculated for each job, is assigned the + expectation value for the amount of cached data that is available to the drone""" def empty(self): diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index ea6c8c6..32e3742 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -15,6 +15,7 @@ class Storage(metaclass=abc.ABCMeta): This class represents the basic structures of all representations of storage in this simulation. 
""" + @property @abc.abstractmethod def size(self) -> int: diff --git a/lapis/job.py b/lapis/job.py index 779c9d7..8f25538 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -144,7 +144,7 @@ def __init__( self._success: Optional[bool] = None """flag indicating whether the job was completed successfully""" self.calculation_efficiency = calculation_efficiency - """efficiency of the job's calculations, can be < 1.0 to account for + """efficiency of the job's calculations, can be < 1.0 to account for programmatical insufficiencies""" # caching-related self.requested_inputfiles = resources.pop("inputfiles", None) @@ -154,7 +154,7 @@ def __init__( self._read_from_cache = 0 """flag indicating whether the job read from the cache""" self._cached_data = 0 - """expectation value for the amount of data that was read from a cache by + """expectation value for the amount of data that was read from a cache by this job""" self._original_walltime = self.walltime """stores the jobs original walltime as a reference""" @@ -203,7 +203,7 @@ def __init__( self.cache_probability = 0 self.failed_matches = 0 - """number of times the job entered the matchmaking process but was not + """number of times the job entered the matchmaking process but was not scheduled to a drone""" @property diff --git a/lapis/scheduler.py b/lapis/scheduler.py index ea7547f..1ad5ca4 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -25,6 +25,7 @@ class JobQueue(list): pass + quantization_defaults = { "memory": HTCInt(128 * 1024 * 1024), "disk": HTCInt(1024 * 1024), @@ -82,6 +83,7 @@ def __getitem__(self, item): :param item: name of a quantity in the classad expression :return: current value of this item """ + def access_wrapped(name, requested=True): """ Extracts the wrapped object's current quantity of a certain resource ( @@ -230,6 +232,7 @@ async def job_finished(self, job): """ raise NotImplementedError + class CondorJobScheduler(JobScheduler): """ Goal of the htcondor job scheduler is to have a scheduler that somehow @@ -797,6 +800,7 @@ def _match_job( highest (prejobrank, jobrank) whose requirements are also compatible with the job is returned as best match. """ + def debug_evaluate(expr, my, target=None): """ Reimplementation of the classad packages evaluate function. 
Having it diff --git a/lapis/scheduler_withoutClassAds.py b/lapis/scheduler_withoutClassAds.py deleted file mode 100644 index 9441c77..0000000 --- a/lapis/scheduler_withoutClassAds.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Dict - -from usim import Scope, interval, Resources - -from lapis.drone import Drone -from lapis.monitor import sampling_required -from lapis.monitor.duplicates import UserDemand - -from lapis.scheduler import JobScheduler - - - diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 332f6ec..717dd2c 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -82,6 +82,5 @@ def __init__(self, throughput: Optional[float] = None): class DummyJob: - def __init__(self, reads_from_cache=False): self.reads_from_cache = reads_from_cache diff --git a/lapis_tests/data/htcondor_pools.csv b/lapis_tests/data/htcondor_pools.csv index 82ffa21..bb2b433 100644 --- a/lapis_tests/data/htcondor_pools.csv +++ b/lapis_tests/data/htcondor_pools.csv @@ -2,4 +2,4 @@ 2 2 224400.0 8000 2 2 223100.0 8000 1 8 196300.0 32200 - 1 4 29700.0 8000 \ No newline at end of file + 1 4 29700.0 8000 diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 5a2e0dd..dea8bd6 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -15,6 +15,7 @@ conversion_GB_to_B = 1000 * 1000 * 1000 + class TestHitrateCaching(object): def test_hitratestorage(self): size = 1000 @@ -53,8 +54,9 @@ async def test_determine_inputfile_source(self): async def test_stream_file(self): throughput = 10 size = 1000 - requested_file = RequestedFile(filename="testfile", - filesize=100 * conversion_GB_to_B) + requested_file = RequestedFile( + filename="testfile", filesize=100 * conversion_GB_to_B + ) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) # does not transfer from cache but from remote storage as there are no files # in the HitrateStorage @@ -70,8 +72,9 @@ async def test_single_transfer_files(self): size = 1000 drone = DummyDrone(throughput) job = DummyJob(True) - requested_files = dict(test=dict(usedsize=100 * conversion_GB_to_B, - hitrates={drone.sitename: 1.0})) + requested_files = dict( + test=dict(usedsize=100 * conversion_GB_to_B, hitrates={drone.sitename: 1.0}) + ) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) # does not transfer from cache but from remote storage as there are no files # in the HitrateStorage @@ -89,10 +92,14 @@ async def test_simultaneous_transfer(self): size = 1000 drone = DummyDrone(throughput) job = DummyJob(True) - requested_files = dict(test1=dict(usedsize=100 * conversion_GB_to_B, - hitrates={drone.sitename: 1.0}), - test2=dict(usedsize=200 * conversion_GB_to_B, hitrates={ - drone.sitename: 1.0})) + requested_files = dict( + test1=dict( + usedsize=100 * conversion_GB_to_B, hitrates={drone.sitename: 1.0} + ), + test2=dict( + usedsize=200 * conversion_GB_to_B, hitrates={drone.sitename: 1.0} + ), + ) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) drone.connection.add_storage_element(hitratestorage) # does not transfer from cache but from remote storage as there are no files @@ -105,9 +112,9 @@ async def test_simultaneous_transfer(self): @via_usim async def test_full_simulation_with_hitratebased_caching(self): - with NamedTemporaryFile(suffix=".csv") as machine_config, \ - NamedTemporaryFile(suffix=".csv") as storage_config, \ - NamedTemporaryFile(suffix=".json") as job_config: + with NamedTemporaryFile(suffix=".csv") as 
machine_config, NamedTemporaryFile( + suffix=".csv" + ) as storage_config, NamedTemporaryFile(suffix=".json") as job_config: with open(machine_config.name, "w") as write_stream: write_stream.write( "TotalSlotCPUs TotalSlotDisk TotalSlotMemory Count sitename \n" @@ -129,20 +136,16 @@ async def test_full_simulation_with_hitratebased_caching(self): "RemoteUserCpu": 2, "Inputfiles": { "a.root": { - "filesize": 5, - "usedsize": 5, - "hitrates": { - "mysite": 1.0 - } - }, + "filesize": 5, + "usedsize": 5, + "hitrates": {"mysite": 1.0}, + }, "b.root": { - "filesize": 5, - "usedsize": 5, - "hitrates": { - "mysite": 0.0 - } - } - } + "filesize": 5, + "usedsize": 5, + "hitrates": {"mysite": 0.0}, + }, + }, } ] json.dump(job_description, write_stream) @@ -159,11 +162,12 @@ async def test_full_simulation_with_hitratebased_caching(self): simulator = Simulator() simulator.create_job_generator( - job_input=job_input, - job_reader=htcondor_job_reader + job_input=job_input, job_reader=htcondor_job_reader ) simulator.create_scheduler(scheduler_type=CondorClassadJobScheduler) - simulator.create_connection_module(remote_throughput=0.1, filebased_caching=False) + simulator.create_connection_module( + remote_throughput=0.1, filebased_caching=False + ) simulator.create_storage( storage_input=storage_input, storage_content_input=storage_content_input, @@ -179,4 +183,3 @@ async def test_full_simulation_with_hitratebased_caching(self): simulator.enable_monitoring() simulator.run() assert 180 == simulator.duration - diff --git a/lapis_tests/test_job_caching.py b/lapis_tests/test_job_caching.py index 8caac2e..95bf78e 100644 --- a/lapis_tests/test_job_caching.py +++ b/lapis_tests/test_job_caching.py @@ -8,31 +8,38 @@ class TestJobCaching(object): @via_usim async def test_calculation_time(self): - self.job = Job(resources={"walltime": 60}, - used_resources={"walltime": 10, "cores": 0.7}) + self.job = Job( + resources={"walltime": 60}, used_resources={"walltime": 10, "cores": 0.7} + ) self.job.drone = DummyDrone(1) starttime = time.now await self.job._calculate() assert time.now - starttime == 10 - self.job = Job(resources={"walltime": 60, "inputfiles": {"file"}}, - used_resources={"walltime": 10, "cores": 0.7}) + self.job = Job( + resources={"walltime": 60, "inputfiles": {"file"}}, + used_resources={"walltime": 10, "cores": 0.7}, + ) self.job.drone = DummyDrone(1) starttime = time.now await self.job._calculate() assert time.now - starttime == 7 - self.job = Job(resources={"walltime": 60, "inputfiles": {"file"}}, - used_resources={"walltime": 10, "cores": 0.7}, - calculation_efficiency=0.5) + self.job = Job( + resources={"walltime": 60, "inputfiles": {"file"}}, + used_resources={"walltime": 10, "cores": 0.7}, + calculation_efficiency=0.5, + ) self.job.drone = DummyDrone(1) starttime = time.now await self.job._calculate() assert time.now - starttime == 14 - self.job = Job(resources={"walltime": 60, "inputfiles": {"file"}}, - used_resources={"walltime": 10}, - calculation_efficiency=0.5) + self.job = Job( + resources={"walltime": 60, "inputfiles": {"file"}}, + used_resources={"walltime": 10}, + calculation_efficiency=0.5, + ) self.job.drone = DummyDrone(1) starttime = time.now await self.job._calculate() @@ -42,54 +49,65 @@ async def test_calculation_time(self): async def test_transfer_time(self): conversion_GB_to_B = 1000 * 1000 * 1000 drone = DummyDrone(1) - self.job = Job(resources={"walltime": 60, - "inputfiles": {"file": {"usedsize": 20 *conversion_GB_to_B}}}, - used_resources={"walltime": 10, - "inputfiles": { - 
"file": {"usedsize": 20 * conversion_GB_to_B, - "hitrates": {}}} - }, - calculation_efficiency=1.0) + self.job = Job( + resources={ + "walltime": 60, + "inputfiles": {"file": {"usedsize": 20 * conversion_GB_to_B}}, + }, + used_resources={ + "walltime": 10, + "inputfiles": { + "file": {"usedsize": 20 * conversion_GB_to_B, "hitrates": {}} + }, + }, + calculation_efficiency=1.0, + ) self.job.drone = drone starttime = time.now await self.job._transfer_inputfiles() assert time.now - starttime == 20 - self.job = Job(resources={"walltime": 60}, - used_resources={"walltime": 10}, - calculation_efficiency=1.0) + self.job = Job( + resources={"walltime": 60}, + used_resources={"walltime": 10}, + calculation_efficiency=1.0, + ) self.job.drone = drone starttime = time.now await self.job._transfer_inputfiles() assert time.now - starttime == 0 - self.job = Job(resources={"walltime": 60, - "inputfiles": { - "file": {"usedsize": 20 *conversion_GB_to_B}}}, - used_resources={"walltime": 10}, - calculation_efficiency=1.0) + self.job = Job( + resources={ + "walltime": 60, + "inputfiles": {"file": {"usedsize": 20 * conversion_GB_to_B}}, + }, + used_resources={"walltime": 10}, + calculation_efficiency=1.0, + ) self.job.drone = drone starttime = time.now await self.job._transfer_inputfiles() assert time.now - starttime == 0 - self.job = Job(resources={"walltime": 60, - "inputfiles": { - "file": {"usedsize": 20 * conversion_GB_to_B}}}, - used_resources={"walltime": 10, - "inputfiles": { - "file": {"usedsize": 20 * - conversion_GB_to_B, - "hitrates": {}}, - } - }, - calculation_efficiency=1.0) + self.job = Job( + resources={ + "walltime": 60, + "inputfiles": {"file": {"usedsize": 20 * conversion_GB_to_B}}, + }, + used_resources={ + "walltime": 10, + "inputfiles": { + "file": {"usedsize": 20 * conversion_GB_to_B, "hitrates": {}} + }, + }, + calculation_efficiency=1.0, + ) self.job.drone = drone starttime = time.now await self.job._transfer_inputfiles() assert time.now - starttime == 20 - diff --git a/lapis_tests/test_remote_storage.py b/lapis_tests/test_remote_storage.py index ae00b88..8d01561 100644 --- a/lapis_tests/test_remote_storage.py +++ b/lapis_tests/test_remote_storage.py @@ -6,7 +6,6 @@ class TestRemoteStorage(object): - def test_throughput(self): remote_storage = RemoteStorage(2.0) assert remote_storage.connection.throughput == 2000000000 @@ -20,9 +19,6 @@ def test_size(self): @via_usim async def test_transfer(self): remote_storage = RemoteStorage(1.0) - requested_file = RequestedFile("testfile", 10*1000*1000*1000) + requested_file = RequestedFile("testfile", 10 * 1000 * 1000 * 1000) await remote_storage.transfer(requested_file) assert time.now == 10 - - - diff --git a/lapis_tests/test_storage_filebasedhitrate.py b/lapis_tests/test_storage_filebasedhitrate.py index 4c5c8ab..f15251c 100644 --- a/lapis_tests/test_storage_filebasedhitrate.py +++ b/lapis_tests/test_storage_filebasedhitrate.py @@ -7,11 +7,12 @@ import pytest -class TestFileBasedHitrateStorag(): +class TestFileBasedHitrateStorag: def test_storage_initialization(self): - filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", - size=200, throughput_limit=1) + filebasedhitratestorage = FileBasedHitrateStorage( + name="name", sitename="site", size=200, throughput_limit=1 + ) assert filebasedhitratestorage.files == {} assert filebasedhitratestorage.name == "name" assert filebasedhitratestorage.sitename == "site" @@ -23,8 +24,9 @@ def test_storage_initialization(self): @via_usim async def test_transfer(self): - 
filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", - size=200, throughput_limit=1) + filebasedhitratestorage = FileBasedHitrateStorage( + name="name", sitename="site", size=200, throughput_limit=1 + ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) await filebasedhitratestorage.transfer(requestedFile, DummyJob()) assert time.now == 20 @@ -34,16 +36,18 @@ async def test_transfer(self): await filebasedhitratestorage.transfer(requestedFile, DummyJob()) def test_find_file_in_storage(self): - filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", - size=200, throughput_limit=1) + filebasedhitratestorage = FileBasedHitrateStorage( + name="name", sitename="site", size=200, throughput_limit=1 + ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) foundFile = LookUpInformation(20, filebasedhitratestorage) assert filebasedhitratestorage.find(requestedFile) == foundFile def test_modification_of_stored_files(self): - filebasedhitratestorage = FileBasedHitrateStorage(name="name", sitename="site", - size=200, throughput_limit=1) + filebasedhitratestorage = FileBasedHitrateStorage( + name="name", sitename="site", size=200, throughput_limit=1 + ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) filebasedhitratestorage.add(requestedFile) From 6c02cf162b4b12965999a067e26d906ff17a2e58 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 9 Dec 2020 20:18:00 +0100 Subject: [PATCH 598/648] fixed flake8 issues --- lapis/caching/connection.py | 2 +- lapis/caching/monitoredpipe.py | 9 ++++++--- lapis/caching/storageelement.py | 6 ++++-- lapis/drone.py | 9 ++++++--- lapis/job.py | 8 ++++---- lapis/scheduler.py | 14 +++++++------- 6 files changed, 28 insertions(+), 20 deletions(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index f16e08a..6324b1b 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -74,7 +74,7 @@ async def report_load_to_monitoring(pipe: MonitoredPipe): async with Scope() as scope: scope.do(report_load_to_monitoring(self.remote_connection.connection)) - for storage_key, storage_list in self.storages.items(): + for _, storage_list in self.storages.items(): for storage in storage_list: scope.do(report_load_to_monitoring(storage.connection)) diff --git a/lapis/caching/monitoredpipe.py b/lapis/caching/monitoredpipe.py index f5827df..12e5531 100644 --- a/lapis/caching/monitoredpipe.py +++ b/lapis/caching/monitoredpipe.py @@ -35,7 +35,8 @@ async def report_load(pipe: MonitoredPipe): print( f'{time.now:6.0f}:' f'{event.requested_throughput} \t' - f'[{event.requested_throughput / event.available_throughput * 100:03.0f}%]' + f'[{event.requested_throughput / event.available_throughput' + f'* 100:03.0f}%]' ) """ await instant @@ -84,10 +85,12 @@ def __repr__(self): async def report_load(pipe: MonitoredPipe): async for event in pipe.load(): + requested_tp = event.requested_throughput + available_tp = event.available_throughput print( f"{time.now:6.0f}:" - f"{event.requested_throughput} \t" - f"[{event.requested_throughput / event.available_throughput * 100:03.0f}%]" + f"{requested_tp} \t" + f"[{requested_tp / available_tp * 100:03.0f}%]" ) async def perform_load(pipe: MonitoredPipe, delay, amount): diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index b957950..2d9c3aa 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -244,7 +244,8 @@ class HitrateStorage(StorageElement): Simplified storage object, used 
to simulate a simplified form of hitrate based caching. No explicit list of stored files is kept. Instead, it is assumed that a fraction `_hitrate` of all files is stored. Every time a file is requested from - this kind of storage, `_hitrate` percent of the file are found on and transferred from this storage. + this kind of storage, `_hitrate` percent of the file are found on and transferred + from this storage. 1 - `_hitrate` percent of the file are transferred from the remote storage associated to the hitrate storage. """ @@ -328,7 +329,8 @@ class FileBasedHitrateStorage(StorageElement): the connection module's file transfer functionality. The definition of the storage objects size is currently irrelevant. - #TODO: this storage object has become very intermingled with the connection module and should be tidied up and restructured! + # TODO: this storage object has become very intermingled with the connection + module and should be tidied up and restructured! """ def __init__( diff --git a/lapis/drone.py b/lapis/drone.py index 5da3d98..eca576f 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -40,8 +40,10 @@ def __init__( start up and it's registration at the scheduler :param ignore_resources: dict of the resource keys that are ignored, e.g. "disk" :param sitename: identifier, used to determine which caches a drone can use - :param connection: connection object that holds remote connection and handles file transfers - :param empty: callable that determines whether the drone is currently running any jobs + :param connection: connection object that holds remote connection and handles + file transfers + :param empty: callable that determines whether the drone is currently running + any jobs """ super(Drone, self).__init__() self.scheduler = scheduler @@ -208,7 +210,8 @@ async def schedule_job(self, job: Job, kill: bool = False): A job is scheduled to a drone by putting it in the drone's job queue. :param job: job that was matched to the drone - :param kill: flag, if true jobs can be killed if they use more resources than they requested + :param kill: flag, if true jobs can be killed if they use more resources + than they requested """ await self._job_queue.put((job, kill)) diff --git a/lapis/job.py b/lapis/job.py index 8f25538..47a31a7 100644 --- a/lapis/job.py +++ b/lapis/job.py @@ -231,10 +231,10 @@ async def _calculate(self): Determines a jobs calculation time based on the jobs CPU time and a calculation efficiency representing inefficient programming. - If a job contains input files and the drone the job runs on has a defined remote - connection (throughput < Inf) the calculation time is given by job's CPU time - divided by a configurable `calculation_efficiency` that can be set != 1, e.g. to - account for programmatic inefficiencies. + If a job contains input files and the drone the job runs on has a defined + remote connection (throughput < Inf) the calculation time is given by + job's CPU time divided by a configurable `calculation_efficiency` that + can be set != 1, e.g. to account for programmatic inefficiencies. Else, the calculation time remains equal to the job's original `walltime`. 
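The HitrateStorage model described in the storageelement.py hunk above splits every requested file by volume instead of tracking individual cached files. The following is a small illustrative helper (not part of the storage element) that expresses that split; whether the two parts are transferred in parallel or sequentially is not specified here, so only the volumes are computed.

def hitrate_transfer_volumes(filesize: float, hitrate: float) -> tuple:
    # `hitrate` of the file is served by the hitrate storage,
    # the remaining 1 - `hitrate` comes from the associated remote storage.
    from_cache = hitrate * filesize
    from_remote = (1 - hitrate) * filesize
    return from_cache, from_remote

# Example: a file of 100 units at hitrate 0.5 splits into 50 units from the
# cache and 50 units from the remote storage.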
""" diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 1ad5ca4..f555a7b 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -575,7 +575,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: yield group def lookup(self, job: Job): - for ranked_key, drones in self._clusters.items(): + for _, drones in self._clusters.items(): for drone in drones: drone._wrapped.look_up_cached_data(job) @@ -651,7 +651,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: yield [{item} for item in drones] def lookup(self, job: Job): - for ranked_key, drones in self._clusters.items(): + for _, drones in self._clusters.items(): for drone in drones: drone._wrapped.look_up_cached_data(job) @@ -864,11 +864,11 @@ async def _schedule_jobs(self): the jobs allocate resources on the original drones before being processed but not during scheduling. 2. The job in the job queue are matched to (the copied)resources iteratively. - The actual matching is performed by the `_match_job` method that returns the most - suitable drone unless no drone is compatible with the job's requirements. - If a match was found, the resources requested by the job are allocated on the matched drone. - If no resources remain unallocated after the last job's allocation, - the matching process is ended for this scheduler interval. + The actual matching is performed by the `_match_job` method that returns the + most suitable drone unless no drone is compatible with the job's requirements. + If a match was found, the resources requested by the job are allocated on the + matched drone. If no resources remain unallocated after the last job's + allocation, the matching process is ended for this scheduler interval. 3. After the job matching is finished, the matched jobs are removed from the job queue as the index of a job in the job queue changes once a job with a lower index is removed from the queue. 
From 6d57b8ff970e95067c80cb4bc9fcf3d6137ad170 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 10 Dec 2020 20:13:10 +0100 Subject: [PATCH 599/648] changed build system to poetry --- lapis/__init__.py | 3 -- lapis/{job.py => cachingjob.py} | 0 pyproject.toml | 62 ++++++++++++--------------------- setup.py | 25 ------------- 4 files changed, 22 insertions(+), 68 deletions(-) delete mode 100644 lapis/__init__.py rename lapis/{job.py => cachingjob.py} (100%) delete mode 100644 setup.py diff --git a/lapis/__init__.py b/lapis/__init__.py deleted file mode 100644 index ebe7177..0000000 --- a/lapis/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Lapis is an adaptable, performant, and interactive scheduling (Lapis) simulator""" - -__version__ = "0.3.0" diff --git a/lapis/job.py b/lapis/cachingjob.py similarity index 100% rename from lapis/job.py rename to lapis/cachingjob.py diff --git a/pyproject.toml b/pyproject.toml index 888111e..b9afd16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,47 +1,29 @@ [build-system] -requires = ["flit"] -build-backend = "flit.buildapi" +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" -[tool.flit.metadata] -module = "lapis" -dist-name = "lapis-sim" -author = "Eileen Kuehn, Max Fischer" -author-email = "mainekuehn@gmail.com" -home-page = "https://github.com/MatterMiners/lapis" -description-file = "README.rst" -keywords = "htcondor simulation python cobald tardis opportunistic scheduling scheduler" +[tool.poetry] +name = "lapis.caching" +version = "0.1.0" +description = "LAPIS extension to simulate caching" +authors = ["Eileen Kuehn ", "Max Fischer "] +license = "MIT" +readme = "README.rst" +homepage = "https://matterminers.github.io" +repository = "https://github.com/MatterMiners/lapis.caching" +keywords = ["caching", "simulation", "opportunistic", "scheduling", "scheduler"] classifiers = [ + "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Information Technology', - 'Intended Audience :: Science/Research', - 'Intended Audience :: System Administrators', - 'Topic :: Adaptive Technologies', - 'Topic :: Office/Business :: Scheduling', - 'Topic :: System :: Distributed Computing', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7' + "Development Status :: 2 - Pre-Alpha", ] - -requires = ["cobald", "usim >= 0.4.3", "click", "classad", "pyparsing > 2.4.1"] - -[tool.flit.metadata.requires-extra] -test = [ - "pytest >=4.3.0", - "flake8", - "flake8-bugbear", - "black; implementation_name=='cpython'", +packages = [ + { include = "lapis" } ] -doc = [ - "sphinx", - "sphinx_rtd_theme", - "sphinxcontrib-contentui", - "sphinx-click", - "change-log", -] -dev = ["pre-commit"] -[tool.flit.metadata.urls] -Documentation = "https://lapis-sim.readthedocs.io/en/latest/" +[tool.poetry.dependencies] +python = "^3.6.1" +lapis-sim = "^0.4.1" + +[tool.poetry.dev-dependencies] diff --git a/setup.py b/setup.py deleted file mode 100644 index e2533d8..0000000 --- a/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -import setuptools - -with open("README.rst", "r") as readme: - long_description = readme.read() - -setuptools.setup( - name="lapis.caching", - version="0.0.1", - author="Eileen Kuehn, Max Fischer", - author_email="mainekuehn@gmail.com", - description="LAPIS extension to simulate caching", - long_description=long_description, - 
url="https://github.com/MatterMiners/lapis.caching", - keywords="caching simulation opportunistic scheduling scheduler", - # Even though `lapis` is not a namespace package, declaring `lapis.caching` - # as one allows to drop `.caching` into `.lapis`. The result works as one. - packages=setuptools.find_namespace_packages(include=["lapis.*"]), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Development Status :: 2 - Pre-Alpha", - ], - install_requires=["lapis-sim"], - python_requires=">=3.6", -) From 8b1799f35040b384821b196fb4966fd84a5311af Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 10 Dec 2020 20:24:30 +0100 Subject: [PATCH 600/648] renamed job to cachingjob --- lapis/caching/connection.py | 2 +- lapis/cachingjob.py | 3 ++- lapis/drone.py | 8 ++++---- lapis/job_io/htcondor.py | 7 ++++--- lapis/job_io/swf.py | 4 ++-- lapis/monitor/general.py | 6 +++--- lapis/scheduler.py | 16 +++++++++------- lapis/scheduler_old.py | 2 +- lapis/simulator.py | 2 +- lapis_tests/__init__.py | 4 ++-- lapis_tests/test_job.py | 34 +++++++++++++++++---------------- lapis_tests/test_job_caching.py | 18 ++++++++--------- 12 files changed, 56 insertions(+), 50 deletions(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 6324b1b..4c2ef2d 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -156,7 +156,7 @@ async def stream_file( await potential_cache.add(requested_file, job_repr) else: print( - f"APPLY CACHING DECISION: Job {job_repr}, " + f"APPLY CACHING DECISION: CachingJob {job_repr}, " f"File {requested_file.filename}: File wasnt " f"cached @ {time.now}" ) diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index 47a31a7..0421273 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -4,12 +4,13 @@ from usim import CancelTask from lapis.monitor import sampling_required +from lapis.job import Job if TYPE_CHECKING: from lapis.drone import Drone -class Job(object): +class CachingJob(Job): """ Objects of this class represent jobs. The job is described from the batch system's viewpoint by the following attributes: diff --git a/lapis/drone.py b/lapis/drone.py index eca576f..53c4d33 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -3,7 +3,7 @@ from usim import time, Scope, instant, Capacities, ResourcesUnavailable, Queue from typing import Optional -from lapis.job import Job +from lapis.cachingjob import CachingJob from lapis.caching.connection import Connection from lapis.monitor.duplicates import DroneStatusCaching @@ -205,7 +205,7 @@ async def shutdown(self): await (time + 1) - async def schedule_job(self, job: Job, kill: bool = False): + async def schedule_job(self, job: CachingJob, kill: bool = False): """ A job is scheduled to a drone by putting it in the drone's job queue. @@ -215,7 +215,7 @@ async def schedule_job(self, job: Job, kill: bool = False): """ await self._job_queue.put((job, kill)) - async def _run_job(self, job: Job, kill: bool): + async def _run_job(self, job: CachingJob, kill: bool): """ Method manages to start a job in the context of the given drone. The job is started regardless of the available resources. 
The resource @@ -291,7 +291,7 @@ async def _run_job(self, job: Job, kill: bool): ) ) - def look_up_cached_data(self, job: Job): + def look_up_cached_data(self, job: CachingJob): """ Determines the amount of the job's input data that is stored in caches the drone can access and sets the drone's `cached_data` attribute to the diff --git a/lapis/job_io/htcondor.py b/lapis/job_io/htcondor.py index d57920b..d2bcad9 100644 --- a/lapis/job_io/htcondor.py +++ b/lapis/job_io/htcondor.py @@ -3,7 +3,7 @@ import logging from typing import Optional -from lapis.job import Job +from lapis.cachingjob import CachingJob from copy import deepcopy @@ -42,7 +42,8 @@ def htcondor_job_reader( htcondor_reader = csv.DictReader(iterable, delimiter=" ", quotechar="'") else: logging.getLogger("implementation").error( - "Invalid input file %s. Job input file can not be read." % iterable.name + "Invalid input file %s. CachingJob input file can not be read." + % iterable.name ) for entry in htcondor_reader: if float(entry[used_resource_name_mapping["walltime"]]) <= 0: @@ -107,7 +108,7 @@ def htcondor_job_reader( except KeyError: pass - yield Job( + yield CachingJob( resources=resources, used_resources=used_resources, queue_date=float(entry[used_resource_name_mapping["queuetime"]]), diff --git a/lapis/job_io/swf.py b/lapis/job_io/swf.py index bc75d20..e39a2ab 100644 --- a/lapis/job_io/swf.py +++ b/lapis/job_io/swf.py @@ -6,7 +6,7 @@ import csv from typing import Optional -from lapis.job import Job +from lapis.cachingjob import CachingJob def swf_job_reader( @@ -87,7 +87,7 @@ def swf_job_reader( ) * unit_conversion_mapping.get(used_resource_name_mapping[key], 1) ) - yield Job( + yield CachingJob( resources=resources, used_resources=used_resources, queue_date=float(row[header[used_resource_name_mapping["queuetime"]]]), diff --git a/lapis/monitor/general.py b/lapis/monitor/general.py index c9e1d13..4fb297e 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/general.py @@ -6,7 +6,7 @@ from cobald.monitor.format_line import LineProtocolFormatter from lapis.drone import Drone -from lapis.job import Job +from lapis.cachingjob import CachingJob from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler from lapis.pool import Pool from lapis.scheduler import CondorJobScheduler, JobQueue @@ -135,7 +135,7 @@ def job_statistics(scheduler: CondorJobScheduler) -> List[Dict]: } -def job_events(job: Job) -> List[Dict]: +def job_events(job: CachingJob) -> List[Dict]: """ Log relevant events for jobs. 
Relevant events are @@ -203,7 +203,7 @@ def job_events(job: Job) -> List[Dict]: job_events.name = "job_event" -job_events.whitelist = (Job,) +job_events.whitelist = (CachingJob,) job_events.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), diff --git a/lapis/scheduler.py b/lapis/scheduler.py index f555a7b..cc49d97 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -13,7 +13,7 @@ from usim import Scope, interval, Resources from lapis.drone import Drone -from lapis.job import Job +from lapis.cachingjob import CachingJob from lapis.monitor import sampling_required from lapis.monitor.duplicates import UserDemand @@ -42,7 +42,7 @@ class JobQueue(list): """ T = TypeVar("T") -DJ = TypeVar("DJ", Drone, Job) +DJ = TypeVar("DJ", Drone, CachingJob) class WrappedClassAd(ClassAd, Generic[DJ]): @@ -448,7 +448,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: raise NotImplementedError @abstractmethod - def lookup(self, job: Job) -> None: + def lookup(self, job: CachingJob) -> None: """Update information about cached data for every drone""" raise NotImplementedError @@ -574,7 +574,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: if group: yield group - def lookup(self, job: Job): + def lookup(self, job: CachingJob): for _, drones in self._clusters.items(): for drone in drones: drone._wrapped.look_up_cached_data(job) @@ -650,7 +650,7 @@ def cluster_groups(self) -> Iterator[List[Set[WrappedClassAd[Drone]]]]: for _ranked_key, drones in self._clusters.items(): yield [{item} for item in drones] - def lookup(self, job: Job): + def lookup(self, job: CachingJob): for _, drones in self._clusters.items(): for drone in drones: drone._wrapped.look_up_cached_data(job) @@ -875,12 +875,14 @@ async def _schedule_jobs(self): 4. The matched jobs' execution is triggered. 
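The matching procedure summarized in the _schedule_jobs docstring (copy the drones' resources, match jobs iteratively, allocate on the copy, then remove matched jobs from the queue back to front and start them) can be condensed into the sketch below. match_job, start_job and the free_resources mapping are placeholders for the scheduler's internals (_match_job, drone execution, the copied drone resources), not the actual implementation.

def schedule_interval(job_queue, free_resources, match_job, start_job):
    matches = []
    for index, job in enumerate(job_queue):
        drone = match_job(job, free_resources)      # None if no drone is compatible
        if drone is None:
            continue
        for name, amount in job.resources.items():  # allocate on the copy only
            free_resources[drone][name] -= amount
        matches.append((index, job, drone))
        if not any(
            amount > 0
            for resources in free_resources.values()
            for amount in resources.values()
        ):
            break  # no resources remain unallocated, end matching for this interval
    # Remove matched jobs back to front so earlier queue indices stay valid.
    for index, job, drone in reversed(matches):
        del job_queue[index]
        start_job(job, drone)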
""" - # Pre Job Rank is the same for all jobs + # Pre CachingJob Rank is the same for all jobs # Use a copy to allow temporary "remainder after match" estimates if self._drones.empty(): return pre_job_drones = self._drones.copy() - matches: List[Tuple[int, WrappedClassAd[Job], WrappedClassAd[Drone]]] = [] + matches: List[ + Tuple[int, WrappedClassAd[CachingJob], WrappedClassAd[Drone]] + ] = [] for queue_index, candidate_job in enumerate(self.job_queue): try: pre_job_drones.lookup(candidate_job._wrapped) diff --git a/lapis/scheduler_old.py b/lapis/scheduler_old.py index 22bbe60..baab225 100644 --- a/lapis/scheduler_old.py +++ b/lapis/scheduler_old.py @@ -138,7 +138,7 @@ def _schedule_job(self, job) -> Drone: cost = 0 resources = drone.theoretical_available_resources # print( - # "trying to match Job {} to {}, resources {}".format( + # "trying to match CachingJob {} to {}, resources {}".format( # repr(job), repr(drone), resources # ) # ) diff --git a/lapis/simulator.py b/lapis/simulator.py index 8037dd3..2cb7f2b 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -10,7 +10,7 @@ import lapis.monitor as monitor from lapis.drone import Drone -from lapis.job import job_to_queue_scheduler +from lapis.cachingjob import job_to_queue_scheduler from lapis.caching.connection import Connection from lapis.monitor.caching import ( storage_status, diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 717dd2c..0de4e8b 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -4,7 +4,7 @@ from usim import run, Resources from lapis.drone import Drone -from lapis.job import Job +from lapis.cachingjob import CachingJob from lapis.caching.connection import Connection @@ -64,7 +64,7 @@ def unregister_drone(drone: Drone): def update_drone(drone: Drone): pass - async def job_finished(self, job: Job): + async def job_finished(self, job: CachingJob): if job.successful: await self.statistics.increase(job_succeeded=1) else: diff --git a/lapis_tests/test_job.py b/lapis_tests/test_job.py index dd985ce..2ccc2a8 100644 --- a/lapis_tests/test_job.py +++ b/lapis_tests/test_job.py @@ -2,7 +2,7 @@ from usim import Scope, time from lapis.drone import Drone -from lapis.job import Job +from lapis.cachingjob import CachingJob from lapis_tests import via_usim, DummyScheduler, DummyDrone from lapis.caching.connection import Connection @@ -11,26 +11,28 @@ class TestJob(object): def test_init(self): with pytest.raises(KeyError): - Job(resources={}, used_resources={}) + CachingJob(resources={}, used_resources={}) with pytest.raises(KeyError): - Job(resources={"walltime": 100}, used_resources={}) - assert Job(resources={}, used_resources={"walltime": 100}) + CachingJob(resources={"walltime": 100}, used_resources={}) + assert CachingJob(resources={}, used_resources={"walltime": 100}) with pytest.raises(AssertionError): - Job(resources={}, used_resources={"walltime": 100}, in_queue_since=-5) + CachingJob( + resources={}, used_resources={"walltime": 100}, in_queue_since=-5 + ) def test_name(self): name = "test" - job = Job(resources={}, used_resources={"walltime": 100}, name=name) + job = CachingJob(resources={}, used_resources={"walltime": 100}, name=name) assert job.name == name - assert repr(job) == "" % name - job = Job(resources={}, used_resources={"walltime": 100}) + assert repr(job) == "" % name + job = CachingJob(resources={}, used_resources={"walltime": 100}) assert job.name == id(job) - assert repr(job) == "" % id(job) + assert repr(job) == "" % id(job) @via_usim async def test_run_job(self): 
drone = DummyDrone() - job = Job(resources={"walltime": 50}, used_resources={"walltime": 10}) + job = CachingJob(resources={"walltime": 50}, used_resources={"walltime": 10}) assert float("inf") == job.waiting_time async with Scope() as scope: scope.do(job.run(drone)) @@ -41,7 +43,7 @@ async def test_run_job(self): @via_usim async def test_job_in_drone(self): scheduler = DummyScheduler() - job = Job( + job = CachingJob( resources={"walltime": 50, "cores": 1, "memory": 1}, used_resources={"walltime": 10, "cores": 1, "memory": 1}, ) @@ -65,7 +67,7 @@ async def test_job_in_drone(self): @via_usim async def test_nonmatching_job_in_drone(self): scheduler = DummyScheduler() - job = Job( + job = CachingJob( resources={"walltime": 50, "cores": 2, "memory": 1}, used_resources={"walltime": 10, "cores": 1, "memory": 1}, ) @@ -88,11 +90,11 @@ async def test_nonmatching_job_in_drone(self): @via_usim async def test_two_nonmatching_jobs(self): scheduler = DummyScheduler() - job_one = Job( + job_one = CachingJob( resources={"walltime": 50, "cores": 1, "memory": 1}, used_resources={"walltime": 10, "cores": 1, "memory": 1}, ) - job_two = Job( + job_two = CachingJob( resources={"walltime": 50, "cores": 1, "memory": 1}, used_resources={"walltime": 10, "cores": 1, "memory": 1}, ) @@ -118,11 +120,11 @@ async def test_two_nonmatching_jobs(self): @via_usim async def test_two_matching_jobs(self): scheduler = DummyScheduler() - job_one = Job( + job_one = CachingJob( resources={"walltime": 50, "cores": 1, "memory": 1}, used_resources={"walltime": 10, "cores": 1, "memory": 1}, ) - job_two = Job( + job_two = CachingJob( resources={"walltime": 50, "cores": 1, "memory": 1}, used_resources={"walltime": 10, "cores": 1, "memory": 1}, ) diff --git a/lapis_tests/test_job_caching.py b/lapis_tests/test_job_caching.py index 95bf78e..04e59e1 100644 --- a/lapis_tests/test_job_caching.py +++ b/lapis_tests/test_job_caching.py @@ -1,6 +1,6 @@ from usim import time -from lapis.job import Job +from lapis.cachingjob import CachingJob from lapis_tests import via_usim, DummyDrone @@ -8,7 +8,7 @@ class TestJobCaching(object): @via_usim async def test_calculation_time(self): - self.job = Job( + self.job = CachingJob( resources={"walltime": 60}, used_resources={"walltime": 10, "cores": 0.7} ) self.job.drone = DummyDrone(1) @@ -16,7 +16,7 @@ async def test_calculation_time(self): await self.job._calculate() assert time.now - starttime == 10 - self.job = Job( + self.job = CachingJob( resources={"walltime": 60, "inputfiles": {"file"}}, used_resources={"walltime": 10, "cores": 0.7}, ) @@ -25,7 +25,7 @@ async def test_calculation_time(self): await self.job._calculate() assert time.now - starttime == 7 - self.job = Job( + self.job = CachingJob( resources={"walltime": 60, "inputfiles": {"file"}}, used_resources={"walltime": 10, "cores": 0.7}, calculation_efficiency=0.5, @@ -35,7 +35,7 @@ async def test_calculation_time(self): await self.job._calculate() assert time.now - starttime == 14 - self.job = Job( + self.job = CachingJob( resources={"walltime": 60, "inputfiles": {"file"}}, used_resources={"walltime": 10}, calculation_efficiency=0.5, @@ -49,7 +49,7 @@ async def test_calculation_time(self): async def test_transfer_time(self): conversion_GB_to_B = 1000 * 1000 * 1000 drone = DummyDrone(1) - self.job = Job( + self.job = CachingJob( resources={ "walltime": 60, "inputfiles": {"file": {"usedsize": 20 * conversion_GB_to_B}}, @@ -68,7 +68,7 @@ async def test_transfer_time(self): await self.job._transfer_inputfiles() assert time.now - starttime == 20 - 
self.job = Job( + self.job = CachingJob( resources={"walltime": 60}, used_resources={"walltime": 10}, calculation_efficiency=1.0, @@ -79,7 +79,7 @@ async def test_transfer_time(self): await self.job._transfer_inputfiles() assert time.now - starttime == 0 - self.job = Job( + self.job = CachingJob( resources={ "walltime": 60, "inputfiles": {"file": {"usedsize": 20 * conversion_GB_to_B}}, @@ -93,7 +93,7 @@ async def test_transfer_time(self): await self.job._transfer_inputfiles() assert time.now - starttime == 0 - self.job = Job( + self.job = CachingJob( resources={ "walltime": 60, "inputfiles": {"file": {"usedsize": 20 * conversion_GB_to_B}}, From 8e9073a689ad0bd67c3c14acc3e1a517f0e33387 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 09:32:34 +0100 Subject: [PATCH 601/648] fixed function that was still defined async --- lapis/caching/connection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 4c2ef2d..a66accf 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -92,7 +92,7 @@ def add_storage_element(self, storage_element: StorageElement): except KeyError: self.storages[storage_element.sitename] = [storage_element] - async def _determine_inputfile_source( + def _determine_inputfile_source( self, requested_file: RequestedFile, dronesite: Optional[str], @@ -138,7 +138,7 @@ async def stream_file( :param dronesite: :param job_repr: """ - used_connection = await self._determine_inputfile_source( + used_connection = self._determine_inputfile_source( requested_file, dronesite, job_repr ) if self._filebased_caching: From 9a1dc7b50af23a1b35d2d2cd22a6029f3717d1ff Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 09:33:24 +0100 Subject: [PATCH 602/648] added hitrates to test data for unit tests --- lapis_tests/data/job_list_minimal.json | 44 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/lapis_tests/data/job_list_minimal.json b/lapis_tests/data/job_list_minimal.json index fc31d03..65d2377 100644 --- a/lapis_tests/data/job_list_minimal.json +++ b/lapis_tests/data/job_list_minimal.json @@ -15,15 +15,18 @@ "Inputfiles": { "a.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "b.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "c.root": { - "filesize": 25000, - "usedsize": 20000 + "filesize": 25000, + "usedsize": 20000, + "hitrates": {"x": 1} } } }, @@ -43,15 +46,18 @@ "Inputfiles": { "a.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "b.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "c.root": { - "filesize": 25000, - "usedsize": 20000 + "filesize": 25000, + "usedsize": 20000, + "hitrates": {"x": 1} } } }, @@ -70,15 +76,18 @@ "Inputfiles": { "a.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "b.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "c.root": { - "filesize": 25000, - "usedsize": 20000 + "filesize": 25000, + "usedsize": 20000, + "hitrates": {"x": 1} } } }, @@ -98,15 +107,18 @@ "Inputfiles": { "a.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "b.root": { "filesize": 25000, - "usedsize": 20000 + "usedsize": 20000, + "hitrates": {"x": 1} }, "c.root": { - "filesize": 25000, - "usedsize": 20000 
+ "filesize": 25000, + "usedsize": 20000, + "hitrates": {"x": 1} } } } From fb02239178df78397cd78d752f7f05791372b32f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 09:33:48 +0100 Subject: [PATCH 603/648] adapted some unit tests --- lapis_tests/test_caching_hitrate_based.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index dea8bd6..ace0d5b 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -28,8 +28,7 @@ def test_hitratestorage(self): assert 100 == looked_up_file.cached_filesize assert hitratestorage == looked_up_file.storage - @via_usim - async def test_add_storage_to_connection(self): + def test_add_storage_to_connection(self): throughput = 10 size = 1000 hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) @@ -37,15 +36,14 @@ async def test_add_storage_to_connection(self): connection.add_storage_element(hitratestorage) assert hitratestorage in connection.storages[hitratestorage.sitename] - @via_usim - async def test_determine_inputfile_source(self): + def test_determine_inputfile_source(self): throughput = 10 size = 1000 requested_file = RequestedFile(filename="testfile", filesize=100) hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) connection = Connection(throughput=throughput) connection.add_storage_element(hitratestorage) - cache = await connection._determine_inputfile_source( + cache = connection._determine_inputfile_source( requested_file=requested_file, dronesite=None ) assert cache is hitratestorage @@ -110,8 +108,7 @@ async def test_simultaneous_transfer(self): assert time.now == 15 assert stream_time == 15 - @via_usim - async def test_full_simulation_with_hitratebased_caching(self): + def test_full_simulation_with_hitratebased_caching(self): with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( suffix=".csv" ) as storage_config, NamedTemporaryFile(suffix=".json") as job_config: @@ -183,3 +180,7 @@ async def test_full_simulation_with_hitratebased_caching(self): simulator.enable_monitoring() simulator.run() assert 180 == simulator.duration + + job_input.close() + storage_input.close() + machine_input.close() From ee0c192cfc9c38632befdc632c676e826dddf4ec Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 10:03:50 +0100 Subject: [PATCH 604/648] starting to adapt documentation --- docs/conf.py | 15 +- docs/source/changes/10.logging_extension.yaml | 12 -- .../changes/19.distribution_and_license.yaml | 11 -- .../changes/23.usim_api_adaptations.yaml | 9 -- .../changes/25.stopping_job_generator.yaml | 8 -- .../changes/26.terminate_simulation.yaml | 12 -- docs/source/changes/27.documentation.yaml | 8 -- docs/source/changes/28.cleanup.yaml | 8 -- docs/source/changes/29.resource_ratio.yaml | 10 -- docs/source/changes/3.flake8.yaml | 8 -- docs/source/changes/36.project_renamed.yaml | 8 -- .../changes/37.usim_api_adaptation.yaml | 8 -- .../changes/44.terminate_simulation.yaml | 13 -- .../changes/45.logging_improvements.yaml | 14 -- ...46.resource_usage_in_scheduling_cycle.yaml | 12 -- docs/source/changes/47.pre_commit_hooks.yaml | 8 -- .../49.htcondor_import_corrections.yaml | 7 - docs/source/changes/50.black_usage.yaml | 7 - docs/source/changes/51.job_input_files.yaml | 7 - .../changes/59.unit_standardisation.yaml | 7 - .../changes/6.swf_import_corrections.yaml | 13 -- docs/source/changes/63.execution_of_jobs.yaml | 11 -- 
.../changes/66.terminate_simulation.yaml | 9 -- .../changes/69.job_drone_requirement.yaml | 7 - docs/source/changes/74.changelog.yaml | 6 - docs/source/changes/76.register_drones.yaml | 7 - docs/source/changes/79.jobqueue_removal.yaml | 8 -- docs/source/changes/80.job_cancelation.yaml | 8 -- docs/source/changes/versions.yaml | 7 +- docs/source/topics/autodoc.rst | 15 +- docs/source/topics/concept.rst | 136 ------------------ docs/source/topics/overview.rst | 2 - docs/source/topics/support.rst | 47 ------ 33 files changed, 17 insertions(+), 451 deletions(-) delete mode 100644 docs/source/changes/10.logging_extension.yaml delete mode 100644 docs/source/changes/19.distribution_and_license.yaml delete mode 100644 docs/source/changes/23.usim_api_adaptations.yaml delete mode 100644 docs/source/changes/25.stopping_job_generator.yaml delete mode 100644 docs/source/changes/26.terminate_simulation.yaml delete mode 100644 docs/source/changes/27.documentation.yaml delete mode 100644 docs/source/changes/28.cleanup.yaml delete mode 100644 docs/source/changes/29.resource_ratio.yaml delete mode 100644 docs/source/changes/3.flake8.yaml delete mode 100644 docs/source/changes/36.project_renamed.yaml delete mode 100644 docs/source/changes/37.usim_api_adaptation.yaml delete mode 100644 docs/source/changes/44.terminate_simulation.yaml delete mode 100644 docs/source/changes/45.logging_improvements.yaml delete mode 100644 docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml delete mode 100644 docs/source/changes/47.pre_commit_hooks.yaml delete mode 100644 docs/source/changes/49.htcondor_import_corrections.yaml delete mode 100644 docs/source/changes/50.black_usage.yaml delete mode 100644 docs/source/changes/51.job_input_files.yaml delete mode 100644 docs/source/changes/59.unit_standardisation.yaml delete mode 100644 docs/source/changes/6.swf_import_corrections.yaml delete mode 100644 docs/source/changes/63.execution_of_jobs.yaml delete mode 100644 docs/source/changes/66.terminate_simulation.yaml delete mode 100644 docs/source/changes/69.job_drone_requirement.yaml delete mode 100644 docs/source/changes/74.changelog.yaml delete mode 100644 docs/source/changes/76.register_drones.yaml delete mode 100644 docs/source/changes/79.jobqueue_removal.yaml delete mode 100644 docs/source/changes/80.job_cancelation.yaml delete mode 100644 docs/source/topics/concept.rst delete mode 100644 docs/source/topics/support.rst diff --git a/docs/conf.py b/docs/conf.py index 53089a9..b783cb9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,9 +24,9 @@ # -- Project information ----------------------------------------------------- -project = "lapis" +project = "lapis.caching" author = "Eileen Kuehn, Max Fischer" -copyright = f"2019 {author}" +copyright = f"2019-2020 {author}" # The short X.Y version version = lapis.__version__ @@ -149,7 +149,7 @@ ( master_doc, "lapis.tex", - "lapis Documentation", + "lapis.caching Documentation", "Eileen Kuehn, Max Fischer", "manual", ) @@ -160,7 +160,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [(master_doc, "lapis", "lapis Documentation", [author], 1)] +man_pages = [(master_doc, project, "lapis.caching Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -171,8 +171,8 @@ texinfo_documents = [ ( master_doc, - "lapis", - "lapis Documentation", + project, + "lapis.caching Documentation", author, "lapis", "One line description of project.", @@ -206,8 +206,7 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { "python": ("https://docs.python.org/3", None), - "usim": ("https://usim.readthedocs.io/en/stable", None), - "cobald": ("https://cobald.readthedocs.io/en/stable", None), + "lapis": ("https://lapis-sim.readthedocs.io/en/stable", None), } # -- Options for todo extension ---------------------------------------------- diff --git a/docs/source/changes/10.logging_extension.yaml b/docs/source/changes/10.logging_extension.yaml deleted file mode 100644 index 29c94ed..0000000 --- a/docs/source/changes/10.logging_extension.yaml +++ /dev/null @@ -1,12 +0,0 @@ -category: changed -summary: "Extension of logging" -description: | - The logging of predefined logging functions now follows a specified database - structure. The structure is documented in the documentation of the package. - Further, the logging was extended to additionally support logging to telegraf - using the python default UDP logging port and the LineProtocolFormat of telegraf. - The timestamps of the log messages follow the simulation time. -pull requests: - - 10 - - 14 -version: 0.1.1 diff --git a/docs/source/changes/19.distribution_and_license.yaml b/docs/source/changes/19.distribution_and_license.yaml deleted file mode 100644 index 5f3d9eb..0000000 --- a/docs/source/changes/19.distribution_and_license.yaml +++ /dev/null @@ -1,11 +0,0 @@ -category: changed -summary: "Distribution setup and license information" -description: | - Usim is a new requirement for installing the package. Further, the distribution - process now uses flit and, therefore, setup.py was replaced by pyproject.toml. - Finally, the license file was adapted to also include Max as an author. -pull requests: - - 19 - - 33 - - 30 -version: 0.1.1 diff --git a/docs/source/changes/23.usim_api_adaptations.yaml b/docs/source/changes/23.usim_api_adaptations.yaml deleted file mode 100644 index f1c212e..0000000 --- a/docs/source/changes/23.usim_api_adaptations.yaml +++ /dev/null @@ -1,9 +0,0 @@ -category: changed -summary: "Support of current API of usim" -description: | - In preparation to support the usim features of borrowing and claiming resources - guaranteeing synchronisation of available resources in drones, the implementation was - adapted to meet the current requireements of usim v0.3. -pull requests: - - 23 -version: 0.1.1 diff --git a/docs/source/changes/25.stopping_job_generator.yaml b/docs/source/changes/25.stopping_job_generator.yaml deleted file mode 100644 index a7d5608..0000000 --- a/docs/source/changes/25.stopping_job_generator.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: fixed -summary: "StopIteration handling by Job Generator" -description: | - The Job Generator so far did not properly handle StopIterations while importing - jobs. This has been fixed. 
-pull requests: - - 25 -version: 0.1.1 diff --git a/docs/source/changes/26.terminate_simulation.yaml b/docs/source/changes/26.terminate_simulation.yaml deleted file mode 100644 index 4ca1327..0000000 --- a/docs/source/changes/26.terminate_simulation.yaml +++ /dev/null @@ -1,12 +0,0 @@ -category: fixed -summary: "Termination of simulation" -description: | - The scheduler so far did not have any information about when the simulation - was expected to terminate. Therefore, a new property `_collecting` was introduced - in the simlulator. This property is bound to the job queue. As soon as the - job generator does not produce any new jobs the job queue is closed and after - all jobs were removed, the property `_collecting` is set to `False` to - initiate termination of simulation. -pull requests: - - 26 -version: 0.1.1 diff --git a/docs/source/changes/27.documentation.yaml b/docs/source/changes/27.documentation.yaml deleted file mode 100644 index fb48fb5..0000000 --- a/docs/source/changes/27.documentation.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: added -summary: "Basic documentation" -description: | - LAPIS now includes a basic documentation about the different components and - concepts, importing jobs and pools, the logging process and database structure - as well as the command line interface. -pull requests: - - 27 diff --git a/docs/source/changes/28.cleanup.yaml b/docs/source/changes/28.cleanup.yaml deleted file mode 100644 index 7611c72..0000000 --- a/docs/source/changes/28.cleanup.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: changed -summary: "Cleanup and improvements of existing code" -description: | - usim related code was simplified and unused code was removed. -pull requests: - - 28 - - 32 -version: 0.1.1 diff --git a/docs/source/changes/29.resource_ratio.yaml b/docs/source/changes/29.resource_ratio.yaml deleted file mode 100644 index 8b394dc..0000000 --- a/docs/source/changes/29.resource_ratio.yaml +++ /dev/null @@ -1,10 +0,0 @@ -category: fixed -summary: "Calculation of used and requested resource ratio" -description: | - The introduction of the borrowing and claiming concept of resources provided - by usim changed the way resources were handled internally. Therefore, the - calculation of used and requested resource ratios had to be adapted to not - result in wrong results. -pull requests: - - 29 -version: 0.1.1 diff --git a/docs/source/changes/3.flake8.yaml b/docs/source/changes/3.flake8.yaml deleted file mode 100644 index a2a16fa..0000000 --- a/docs/source/changes/3.flake8.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: added -summary: "Requirement for flake8" -description: | - Flake8 was added as a requirement for continuous integration. Further, all - issues with flake8 were fixed. -pull requests: - - 3 -version: 0.1.1 diff --git a/docs/source/changes/36.project_renamed.yaml b/docs/source/changes/36.project_renamed.yaml deleted file mode 100644 index 1684f8c..0000000 --- a/docs/source/changes/36.project_renamed.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: changed -summary: "Rename from lapis to lapis-sim for pypi and rtd" -description: | - The name *lapis* in pypi was already taken so we had to change the distribution - name and decided to go for *lapis-sim*. 
-pull requests: - - 36 -version: 0.2.0 diff --git a/docs/source/changes/37.usim_api_adaptation.yaml b/docs/source/changes/37.usim_api_adaptation.yaml deleted file mode 100644 index fa65b4b..0000000 --- a/docs/source/changes/37.usim_api_adaptation.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: changed -summary: "Support of current API of usim" -description: | - In preparation to support upcomming features of usim, the current usage had to - be adapted to meet the current requireements of usim v0.4. -pull requests: - - 37 -version: 0.2.0 diff --git a/docs/source/changes/44.terminate_simulation.yaml b/docs/source/changes/44.terminate_simulation.yaml deleted file mode 100644 index 1f60138..0000000 --- a/docs/source/changes/44.terminate_simulation.yaml +++ /dev/null @@ -1,13 +0,0 @@ -category: fixed -summary: "Proper termination of simulation" -description: | - usim defines non-volatile and volatile running of async tasks within scopes. - Volatile tasks can be finished by the outer scope when the outer scope - terminates while non-volatile tasks must be finished before leaving the scope. - So far, all tasks were run as non-volatile in lapis resulting in simulations - that never finished as ``pools``, and ``controllers`` for example were running - forever. This is fixed now by starting those tasks as volatile when running - a simulation. -pull requests: - - 44 -version: 0.3.0 diff --git a/docs/source/changes/45.logging_improvements.yaml b/docs/source/changes/45.logging_improvements.yaml deleted file mode 100644 index e35f12b..0000000 --- a/docs/source/changes/45.logging_improvements.yaml +++ /dev/null @@ -1,14 +0,0 @@ -category: changed -summary: "Object-based logging and logging for job events" -description: | - So far the logging during simulation was flag-based. A soon as the logging flag - was set to ``True``, the logging process started. This created a lot of overhead. - The logging now is object-based, meaning that only objects with relevant changes - for logging are added to a logging queue. Each of those objects in the logging - queue are processed by registered logging functions. - - Further, now also the logging of job events is supported through this global - logging process. -pull requests: - - 45 -version: 0.3.0 diff --git a/docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml b/docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml deleted file mode 100644 index 2103374..0000000 --- a/docs/source/changes/46.resource_usage_in_scheduling_cycle.yaml +++ /dev/null @@ -1,12 +0,0 @@ -category: fixed -summary: "Update of available resources during scheduling cycle" -description: | - Until now jobs took care on updating available resources after a job was - started resulting in an adaption of the auto clustering in the scheduler. - As the starting of jobs took longer than the scheduling within one scheduling - cycle another job could be assigned although the resources were gone already. - This is fixed now by temporarily assuming resource allocation after a job was - sent to a drone within the scheduler itself. -pull requests: - - 46 -version: 0.3.0 diff --git a/docs/source/changes/47.pre_commit_hooks.yaml b/docs/source/changes/47.pre_commit_hooks.yaml deleted file mode 100644 index 42c3eff..0000000 --- a/docs/source/changes/47.pre_commit_hooks.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: added -summary: "Pre-commit hooks" -description: | - LAPIS now defines some pre-commit hooks including the execution of black for - proper formatting of source code. 
All files have, therefore, also been blackened. -pull requests: - - 47 -version: 0.3.0 diff --git a/docs/source/changes/49.htcondor_import_corrections.yaml b/docs/source/changes/49.htcondor_import_corrections.yaml deleted file mode 100644 index ccce8cd..0000000 --- a/docs/source/changes/49.htcondor_import_corrections.yaml +++ /dev/null @@ -1,7 +0,0 @@ -category: fixed -summary: "Importing of HTCondor jobs" -description: | - The unit conversion for some of the values from HTCondor jobs did not work - properly as values were overwritten. This is fixed now. -pull requests: - - 49 diff --git a/docs/source/changes/50.black_usage.yaml b/docs/source/changes/50.black_usage.yaml deleted file mode 100644 index a766b64..0000000 --- a/docs/source/changes/50.black_usage.yaml +++ /dev/null @@ -1,7 +0,0 @@ -category: fixed -summary: "Handling of black for pypy" -description: | - Black does not work when running CI with pypy. The usage of black with pypy - therefore has been removed now. -pull requests: - - 50 diff --git a/docs/source/changes/51.job_input_files.yaml b/docs/source/changes/51.job_input_files.yaml deleted file mode 100644 index 9df0ef3..0000000 --- a/docs/source/changes/51.job_input_files.yaml +++ /dev/null @@ -1,7 +0,0 @@ -category: added -summary: "Information about input files for jobs" -description: | - In preparation for enabling caching and its effects within the simulation the - support of input files for jobs has been added. -pull requests: - - 51 diff --git a/docs/source/changes/59.unit_standardisation.yaml b/docs/source/changes/59.unit_standardisation.yaml deleted file mode 100644 index 53adaa0..0000000 --- a/docs/source/changes/59.unit_standardisation.yaml +++ /dev/null @@ -1,7 +0,0 @@ -category: changed -summary: "Standardisation of units" -description: | - The usage of units was not standardised so far. We now introduced to solely work - with Bytes and use ``int`` for representation throughout the simulation. -pull requests: - - 59 diff --git a/docs/source/changes/6.swf_import_corrections.yaml b/docs/source/changes/6.swf_import_corrections.yaml deleted file mode 100644 index c6a8d4a..0000000 --- a/docs/source/changes/6.swf_import_corrections.yaml +++ /dev/null @@ -1,13 +0,0 @@ -category: fixed -summary: "Importing of SWF files" -description: | - SWF imports did not properly do the unit conversion but instead accessed a - wrong dictionary to get the conversion information. Now they use the correct - dictionary. - Further, SWF defines values of ``-1`` as a default for user values that have - not been specified. However, negative values of resources in LAPIS are not - supported. Therefore, negative values now default to ``0`` in import process.ß -pull requests: - - 6 - - 22 -version: 0.1.1 diff --git a/docs/source/changes/63.execution_of_jobs.yaml b/docs/source/changes/63.execution_of_jobs.yaml deleted file mode 100644 index 49eb3b8..0000000 --- a/docs/source/changes/63.execution_of_jobs.yaml +++ /dev/null @@ -1,11 +0,0 @@ -category: fixed -summary: "Jobs execution within drones" -description: | - Until now jobs have been started implictly by the scheduler within its scheduling - cycle. This created some issues for asynchronous tasks within jobs as those were - bound to the duration of the scheduling cycle. Therefore, we introduced a new - queue within drones that receive the jobs from the scheduler. Further, the drone - now takes care to properly start the job so that it runs independently now in - the scope of the drone. 
-pull requests: - - 63 diff --git a/docs/source/changes/66.terminate_simulation.yaml b/docs/source/changes/66.terminate_simulation.yaml deleted file mode 100644 index 32d929c..0000000 --- a/docs/source/changes/66.terminate_simulation.yaml +++ /dev/null @@ -1,9 +0,0 @@ -category: fixed -summary: "Proper termination of simulation" -description: | - The simulation so far only awaited that the job queue became empty. This however - did not ensure that the jobs also properly finished. Therefore, the scheduler - now has a new property to store the number of running jobs. So the termination - of the scheduler is now additionally bound to this counter getting ``0``. -pull requests: - - 66 diff --git a/docs/source/changes/69.job_drone_requirement.yaml b/docs/source/changes/69.job_drone_requirement.yaml deleted file mode 100644 index 26823aa..0000000 --- a/docs/source/changes/69.job_drone_requirement.yaml +++ /dev/null @@ -1,7 +0,0 @@ -category: added -summary: "Drone as a requirement to run a job" -description: | - Until now the run method of jobs did not require any parameter. This is changed - now and relies on a parameter for the executing drone. -pull requests: - - 69 diff --git a/docs/source/changes/74.changelog.yaml b/docs/source/changes/74.changelog.yaml deleted file mode 100644 index 1fa7889..0000000 --- a/docs/source/changes/74.changelog.yaml +++ /dev/null @@ -1,6 +0,0 @@ -category: added -summary: "Changelog" -description: | - The documentation now includes a changelog up to the current version. -pull requests: - - 74 diff --git a/docs/source/changes/76.register_drones.yaml b/docs/source/changes/76.register_drones.yaml deleted file mode 100644 index a0f0c90..0000000 --- a/docs/source/changes/76.register_drones.yaml +++ /dev/null @@ -1,7 +0,0 @@ -category: fixed -summary: "Duplicate registration of drones" -description: | - At startup drones were registered twice at the scheduler as the method - ``register_drone`` was called during initialisation and in ``run``. -pull requests: - - 76 diff --git a/docs/source/changes/79.jobqueue_removal.yaml b/docs/source/changes/79.jobqueue_removal.yaml deleted file mode 100644 index 5dd0cf6..0000000 --- a/docs/source/changes/79.jobqueue_removal.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: fixed -summary: "Scheduling of jobs" -description: | - During the scheduling cycle the original job queue was used although jobs - could be removed during scheduling. Now scheduling is performed on a copy - of the job queue. -pull requests: - - 79 diff --git a/docs/source/changes/80.job_cancelation.yaml b/docs/source/changes/80.job_cancelation.yaml deleted file mode 100644 index 7738cc4..0000000 --- a/docs/source/changes/80.job_cancelation.yaml +++ /dev/null @@ -1,8 +0,0 @@ -category: fixed -summary: "Cancelation of jobs" -description: | - When a drone tried to cancel a job it could happen that the success state - of that job was not properly set as the job was not yet in running state. - This is fixed now by additionally waiting for an `instant`. 
-pull requests: - - 80 diff --git a/docs/source/changes/versions.yaml b/docs/source/changes/versions.yaml index 904ba40..c4abffd 100644 --- a/docs/source/changes/versions.yaml +++ b/docs/source/changes/versions.yaml @@ -1,6 +1 @@ -- semver: 0.3.0 - date: '2019-10-27' -- semver: 0.2.0 - date: '2019-10-25' -- semver: 0.1.1 - date: '2019-10-24' +- semver: 0.1.0 diff --git a/docs/source/topics/autodoc.rst b/docs/source/topics/autodoc.rst index 82ea68a..c1603cb 100644 --- a/docs/source/topics/autodoc.rst +++ b/docs/source/topics/autodoc.rst @@ -3,18 +3,21 @@ Detailed documentation of all relevant modules .. automodule:: lapis.connection +.. automodule:: lapis.files + +.. automodule:: lapis.storageelement + +.. automodule:: lapis.monitoredpipe + +Other modules from LAPIS +------------------------ + .. automodule:: lapis.scheduler .. automodule:: lapis.job -.. automodule:: lapis.storageelement - .. automodule:: lapis.drone .. automodule:: lapis.pool -.. automodule:: lapis.files - .. automodule:: lapis.simulator - -.. automodule:: lapis.monitoredpipe diff --git a/docs/source/topics/concept.rst b/docs/source/topics/concept.rst deleted file mode 100644 index 1e17a2d..0000000 --- a/docs/source/topics/concept.rst +++ /dev/null @@ -1,136 +0,0 @@ -Simulation Concept -================== - -Background ----------- - -.. TODO:: - - HEP context. - -Components ----------- - -The core simulation builds on several components, and concepts: - -* :term:`Job Generator`, -* :term:`Job Queue`, -* :term:`Pools ` and their :term:`Controllers `, -* :term:`Drones `, and -* the :term:`Scheduler`, - -If you are planning to adapt the simulation for your specific use case, please -consider the different components to determine what and where to extend functionality. - -Job Generator -~~~~~~~~~~~~~ - -The Job Generator processes any job input files. It takes care to -translate time-based characteristics of the :term:`jobs ` into simulation -time. For this the timestamp of the first :term:`job` of each job input file is -taken as the ``base`` timestamp, resulting in a time value of ``0`` for the -first :term:`job`. All following :term:`jobs ` are adapted accordingly, -i.e. time is ``time - base``. - -The Job Generator itself acts as a generator, meaning that a :term:`job` is put -into the simulations :term:`Job Queue` as soon as the simulation time corresponds -to the translated :term:`job` queueing time. - -Job Queue -~~~~~~~~~ - -The Job Queue is filled with :term:`jobs ` in creation-time order by the -:term:`Job Generator`. The queue is managed by the :term:`scheduler` and contains -all :term:`jobs ` that are not yet scheduled to a :term:`drone` as well as -:term:`jobs ` that have not yet been processed succesfully. - -Pools -~~~~~ - -Pools are created based on the pool input files. Each pool is characterised by -a set of defined resources. Further, pools have a ``capacity`` number of -:term:`drones ` that can be created from a given pool. If the capacity -is not specified, a maximum capacity of ``float("inf")`` is assumed. - -For pools, we differentiate static and dynamic pools. While static pools are -intialised with a fixed amount of :term:`drones `, the number of -:term:`drones ` is adapted dynamically by the -:term:`pool controller ` for dynamic pools. - -.. autoclass:: lapis.pool.Pool -.. autoclass:: lapis.pool.StaticPool - -Controllers -~~~~~~~~~~~ - -Each :term:`pool` is managed by a controller. 
Each controller runs -periodically to check :term:`allocation` and :term:`utilisation` of assigned -:term:`pool(s) ` to regulate the demand of :term:`drones ` for the -given :term:`pool`. - -The concept of controllers is introduced by COBalD. The controllers implemented -in LAPIS share the general concept as well as implementation by subclassing -provided controllers such as :py:class:`cobald.controller.linear.LinearController` -or :py:class:`cobald.controller.relative_supply.RelativeSupplyController` and -overwriting :py:meth:`lapis.controller.SimulatedLinearController.run`. In -this way, we enable validation of current TARDIS/COBalD setup as well as simulation -of future extensions. - -Available controller implementations from COBalD in LAPIS are: - -.. autoclass:: lapis.controller.SimulatedLinearController - :members: - -.. autoclass:: lapis.controller.SimulatedRelativeSupplyController - :members: - -And there is also an implementation considered as an extension for COBalD: - -.. autoclass:: lapis.controller.SimulatedCostController - :members: - -Drones -~~~~~~ - -Drones provide instances of the set of resources defined by a given :term:`pool`. -Drones are the only objects in the simulation that are able to process -:term:`jobs `. Simplified, drones represent worker nodes. - -The concept of drones is introduced by TARDIS. A drone is a generalisation of -the pilot concept used for example in High Energy Physics and is a placeholder -for the real workloads to be processed. A drone is expected to autonomously -manage its lifecycle, meaning, that it handles failures and termination -independently from other components within the system. - -.. warning:: - - Drones are not yet fully employed in LAPIS. They already run independently - but do not handle termination themselves. - -Scheduler -~~~~~~~~~ - -The scheduler is the connecting component between the :term:`jobs ` in the -:term:`job queue` and the running :term:`drones `. It does the matchmaking -between :term:`jobs ` and :term:`drones ` to assign the -:term:`jobs ` to the best evaluated :term:`drone`. Whenever a :term:`job` -is assigned to a :term:`drone`, the :term:`job` is removed from the -:term:`job queue`. The scheduler is notified as soon as the :term:`job` is -terminated independent from the state of termination. It is the task of the -scheduler to decide to either remove the :term:`job` from the simulation in case -of success or to re-insert the :term:`job` into the :term:`job queue` to retry -processing. - -LAPIS currently supports an HTCondor-like implementation of a scheduler: - -.. autoclass:: lapis.scheduler.CondorJobScheduler - :members: - -.. warning:: - - The implementation of the HTCondor scheduler is still very rough. - The matchmaking currently does not rely on given ``requirements``, but only - considers required and provided ``resources`` for :term:`jobs ` and - :term:`drones `. The automatic clustering, therefore, also only relies - on the type and number of ``resources`` and is applied to :term:`drones ` - only at the moment. diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index fcd80ff..346e63c 100644 --- a/docs/source/topics/overview.rst +++ b/docs/source/topics/overview.rst @@ -6,7 +6,5 @@ This is a collection of separate topics on LAPIS. .. 
toctree:: :maxdepth: 1 - concept monitoring cli - support diff --git a/docs/source/topics/support.rst b/docs/source/topics/support.rst deleted file mode 100644 index fd35b99..0000000 --- a/docs/source/topics/support.rst +++ /dev/null @@ -1,47 +0,0 @@ -Supported File Formats -====================== - -TARDIS ------- - -.. warning:: - - Import of TARDIS configuration files not supported yet, but will be - available in the future. - -HTCondor --------- - -Job Imports -~~~~~~~~~~~ - -:term:`Jobs ` can be created directly from HTCondor outputs. Via the -``condor_history`` command from HTCondor, ClassAds describing a :term:`jobs ` -requested and used resources can be gathered and saved to a csv file. -To sufficiently describe a :term:`job` for the simulation information about -requested and used resources should be included in the export: - -requested resources: - RequestCpus, RequestWalltime, RequestMemory, RequestDisk - -used resources: - RemoteWallClockTime, MemoryUsage, DiskUsage_RAW, RemoteSysCpu, RemoteUserCpu - -additional job information: - QDate, GlobalJobId - -In the csv file format every line represents a :term:`job`. The columns are -separated by spaces, and comments are marked by simple quotation marks. - -.. note:: - - If information about the input files of a :term:`jobs ` should be passed - to LAPIS, a separate csv file is required. This feature is not provided yet, - but will be added in one of the next versions. - -Input file information of jobs are not part of the standard :term:`jobs ` -ClassAds in HTCondor but can be extracted via external tools (e.g. job submission -tools). - -SWF Format ----------- From 1f8323a9c31ec83f39af41fb5b29afe5bd87d87c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 10:09:19 +0100 Subject: [PATCH 605/648] removed archived old scheduler --- lapis/scheduler_old.py | 176 ----------------------------------------- 1 file changed, 176 deletions(-) delete mode 100644 lapis/scheduler_old.py diff --git a/lapis/scheduler_old.py b/lapis/scheduler_old.py deleted file mode 100644 index baab225..0000000 --- a/lapis/scheduler_old.py +++ /dev/null @@ -1,176 +0,0 @@ -from typing import Dict -from usim import Scope, interval, Resources, time - -from lapis.drone import Drone -from lapis.monitor import sampling_required - - -class JobQueue(list): - pass - - -class CondorJobScheduler(object): - """ - Goal of the htcondor job scheduler is to have a scheduler that somehow - mimics how htcondor does schedule jobs. - Htcondor does scheduling based on a priority queue. The priorities itself - are managed by operators of htcondor. - So different instances can apparently behave very different. - - In my case I am going to try building a priority queue that sorts job slots - by increasing cost. The cost itself is calculated based on the current - strategy that is used at GridKa. The scheduler checks if a job either - exactly fits a slot or if it does fit into it several times. The cost for - putting a job at a given slot is given by the amount of resources that - might remain unallocated. 
- :return: - """ - - def __init__(self, job_queue): - self._stream_queue = job_queue - self.drone_cluster = [] - self.interval = 60 - self.job_queue = JobQueue() - self._collecting = True - self._processing = Resources(jobs=0) - - @property - def drone_list(self): - for cluster in self.drone_cluster: - for drone in cluster: - yield drone - - def register_drone(self, drone: Drone): - self._add_drone(drone) - - def unregister_drone(self, drone: Drone): - for cluster in self.drone_cluster: - try: - cluster.remove(drone) - except ValueError: - pass - else: - if len(cluster) == 0: - self.drone_cluster.remove(cluster) - - def _add_drone(self, drone: Drone, drone_resources: Dict = None): - minimum_distance_cluster = None - distance = float("Inf") - if len(self.drone_cluster) > 0: - for cluster in self.drone_cluster: - current_distance = 0 - for key in {*cluster[0].pool_resources, *drone.pool_resources}: - if drone_resources: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone_resources.get(key, 0) - ) - else: - current_distance += abs( - cluster[0].theoretical_available_resources.get(key, 0) - - drone.theoretical_available_resources.get(key, 0) - ) - if current_distance < distance: - minimum_distance_cluster = cluster - distance = current_distance - if distance < 1: - minimum_distance_cluster.append(drone) - else: - self.drone_cluster.append([drone]) - else: - self.drone_cluster.append([drone]) - - def update_drone(self, drone: Drone): - self.unregister_drone(drone) - self._add_drone(drone) - - async def run(self): - async with Scope() as scope: - scope.do(self._collect_jobs()) - async for _ in interval(self.interval): - print("NEW SCHEDULING INTERVAL @ {}".format(time.now)) - print(self.job_queue) - for job in self.job_queue.copy(): - print("SCHEDULING {}".format(repr(job))) - best_match = self._schedule_job(job) - if best_match: - print( - "start job {} on drone {} @ {}".format( - repr(job), repr(best_match), time.now - ) - ) - await best_match.schedule_job(job) - self.job_queue.remove(job) - await sampling_required.put(self.job_queue) - self.unregister_drone(best_match) - left_resources = best_match.theoretical_available_resources - left_resources = { - key: value - job.resources.get(key, 0) - for key, value in left_resources.items() - } - self._add_drone(best_match, left_resources) - if ( - not self._collecting - and not self.job_queue - and self._processing.levels.jobs == 0 - ): - break - await sampling_required.put(self) - - async def _collect_jobs(self): - async for job in self._stream_queue: - self.job_queue.append(job) - await self._processing.increase(jobs=1) - # TODO: logging happens with each job - await sampling_required.put(self.job_queue) - self._collecting = False - - async def job_finished(self, job): - if job.successful: - await self._processing.decrease(jobs=1) - else: - await self._stream_queue.put(job) - - def _schedule_job(self, job) -> Drone: - priorities = {} - for cluster in self.drone_cluster: - drone = cluster[0] - cost = 0 - resources = drone.theoretical_available_resources - # print( - # "trying to match CachingJob {} to {}, resources {}".format( - # repr(job), repr(drone), resources - # ) - # ) - for resource_type in job.resources: - if resources.get(resource_type, 0) < job.resources[resource_type]: - # Inf for all job resources that a drone does not support - # and all resources that are too small to even be considered - cost = float("Inf") - break - else: - try: - cost += 1 / ( - resources[resource_type] // 
job.resources[resource_type] - ) - except KeyError: - pass - for additional_resource_type in [ - key for key in drone.pool_resources if key not in job.resources - ]: - cost += resources[additional_resource_type] - cost /= len((*job.resources, *drone.pool_resources)) - if cost <= 1: - # directly start job - return drone - try: - priorities[cost].append(drone) - except KeyError: - priorities[cost] = [drone] - try: - minimal_key = min(priorities) - if minimal_key < float("Inf"): - return priorities[minimal_key][0] - except ValueError: - pass - return None From 36ad41ce2a675a8f06e7a4f97b59c049993bc2b9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 10:37:32 +0100 Subject: [PATCH 606/648] added missing requirements --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index b9afd16..5121fe2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,5 +25,12 @@ packages = [ [tool.poetry.dependencies] python = "^3.6.1" lapis-sim = "^0.4.1" +classad = "^0.4.0" +numpy = "^1.19.4" [tool.poetry.dev-dependencies] +pytest = ">= 4.3.0" +flake8 = "^3.8.4" +flake8-bugbear = "^20.11.1" +black = { version = "^20.8b1", markers = "implementation_name=='cpython'" } +pre-commit = "^2.9.3" From 385706ac2927a21bc1da4f0a31a6f1857cb28a92 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 10:41:00 +0100 Subject: [PATCH 607/648] removed reference to numpy --- lapis/scheduler.py | 3 +-- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index cc49d97..405b2fe 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,5 +1,6 @@ import random from abc import ABC, abstractmethod +from statistics import mean from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple, Any from weakref import WeakKeyDictionary @@ -17,8 +18,6 @@ from lapis.monitor import sampling_required from lapis.monitor.duplicates import UserDemand -from numpy import mean - from usim import time diff --git a/pyproject.toml b/pyproject.toml index 5121fe2..298be47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ packages = [ python = "^3.6.1" lapis-sim = "^0.4.1" classad = "^0.4.0" -numpy = "^1.19.4" [tool.poetry.dev-dependencies] pytest = ">= 4.3.0" From c7784b2cd2ab5ea5d834fb7bd50d5bdad9b3b976 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 11:32:29 +0100 Subject: [PATCH 608/648] fixed last failing unit tests --- lapis_tests/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 0de4e8b..6f0e72f 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -1,5 +1,6 @@ from typing import Callable, Coroutine, Optional from functools import wraps +import gc from usim import run, Resources @@ -34,6 +35,7 @@ async def test_sleep(): @wraps(test_case) def run_test(*args, **kwargs): + gc.collect() # force collecting leftover coroutines test_completed = False async def complete_test_case(): From dc6fc43e44165cd7461071ef677d94d166578667 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 12:37:40 +0100 Subject: [PATCH 609/648] removed reference to version from documentation --- docs/conf.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index b783cb9..c0380ed 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,11 +15,6 @@ # import os # import sys # sys.path.insert(0, os.path.abspath('.')) -import os -import 
sys - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -import lapis # noqa: E402 # -- Project information ----------------------------------------------------- @@ -29,7 +24,7 @@ copyright = f"2019-2020 {author}" # The short X.Y version -version = lapis.__version__ +version = "0.1.0" # The full version, including alpha/beta/rc tags release = version From 6ce0d3d3423e25e56bae5527985833d7afd6a14c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 12:40:29 +0100 Subject: [PATCH 610/648] removed unnecessary sphinx extensions --- docs/conf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index c0380ed..3d9af97 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,8 +39,6 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - "sphinx_automodapi.automodapi", - "sphinx.ext.autosummary", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.todo", From 710a95cfc64b522587197da91f406d4d7b873169 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 12:43:11 +0100 Subject: [PATCH 611/648] added extras required for building documentation --- pyproject.toml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 298be47..901d470 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,24 @@ python = "^3.6.1" lapis-sim = "^0.4.1" classad = "^0.4.0" +Sphinx = { version = "^3.3.1", optional = true } +sphinx-rtd-theme = { version = "^0.5.0", optional = true } +sphinxcontrib-contentui = { version = "^0.2.5", optional = true } +sphinx-click = { version = "^2.5.0", optional = true } +change-log = { version = "^0.2.0", optional = true } + [tool.poetry.dev-dependencies] pytest = ">= 4.3.0" flake8 = "^3.8.4" flake8-bugbear = "^20.11.1" black = { version = "^20.8b1", markers = "implementation_name=='cpython'" } pre-commit = "^2.9.3" + +[tool.poetry.extras] +doc = [ + "sphinx", + "sphinx_rtd_theme", + "sphinxcontrib-contentui", + "sphinx-click", + "change-log", +] From 7f0520fa19d54e06746d480130dae7b35665f202 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 12:49:46 +0100 Subject: [PATCH 612/648] added test dependencies to pyproject --- pyproject.toml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 901d470..dea816f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,11 +33,12 @@ sphinxcontrib-contentui = { version = "^0.2.5", optional = true } sphinx-click = { version = "^2.5.0", optional = true } change-log = { version = "^0.2.0", optional = true } +pytest = { version = ">= 4.3.0", optional = true } +flake8 = { version = "^3.8.4", optional = true } +flake8-bugbear = { version = "^20.11.1", optional = true } +black = { version = "^20.8b1", markers = "implementation_name=='cpython'", optional = true } + [tool.poetry.dev-dependencies] -pytest = ">= 4.3.0" -flake8 = "^3.8.4" -flake8-bugbear = "^20.11.1" -black = { version = "^20.8b1", markers = "implementation_name=='cpython'" } pre-commit = "^2.9.3" [tool.poetry.extras] @@ -48,3 +49,9 @@ doc = [ "sphinx-click", "change-log", ] +test = [ + "pytest", + "flake8", + "flake8-bugbear", + "black" +] From 7a050e06b596442fbaee343147c8a88b6ee37e4b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 18:21:04 +0100 Subject: [PATCH 613/648] removed job_repr from storageelement --- lapis/caching/connection.py | 6 +-- lapis/{ => caching}/monitor/caching.py | 0 
lapis/caching/storageelement.py | 48 ++++++++------------ lapis/interfaces/_storage.py | 8 ++-- lapis/simulator.py | 2 +- lapis_tests/test_caching_hitrate_based.py | 2 +- lapis_tests/test_storage_filebasedhitrate.py | 6 +-- 7 files changed, 30 insertions(+), 42 deletions(-) rename lapis/{ => caching}/monitor/caching.py (100%) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index a66accf..10aecbd 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -14,7 +14,7 @@ from lapis.caching.storageelement import StorageElement, RemoteStorage from lapis.caching.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor import sampling_required -from lapis.monitor.caching import HitrateInfo +from lapis.caching.monitor.caching import HitrateInfo class Connection(object): @@ -115,7 +115,7 @@ def _determine_inputfile_source( if provided_storages is not None: look_up_list = [] for storage in provided_storages: - look_up_list.append(storage.find(requested_file, job_repr)) + look_up_list.append(storage.find(requested_file)) storage_list = sorted( [entry for entry in look_up_list], key=lambda x: x[0], reverse=True ) @@ -162,7 +162,7 @@ async def stream_file( ) except KeyError: pass - await used_connection.transfer(requested_file, job_repr=job_repr) + await used_connection.transfer(requested_file) async def transfer_files(self, drone, requested_files: dict, job_repr): """ diff --git a/lapis/monitor/caching.py b/lapis/caching/monitor/caching.py similarity index 100% rename from lapis/monitor/caching.py rename to lapis/caching/monitor/caching.py diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index 2d9c3aa..a229124 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -67,7 +67,7 @@ async def remove(self, file: StoredFile, **kwargs): """ raise NotImplementedError - def find(self, file: RequestedFile, **kwargs) -> LookUpInformation: + def find(self, file: RequestedFile) -> LookUpInformation: """ All files are contained in remote storage. Therefore no functionality to determine whether the storage contains a certain file is provided. @@ -158,19 +158,18 @@ def used(self): def available(self): return self.size - self.used - async def remove(self, file: StoredFile, job_repr=None): + async def remove(self, file: StoredFile): """ Deletes file from storage object. The time this operation takes is defined by the storages deletion_duration attribute. :param file: representation of the file that is removed from the storage - :param job_repr: Needed for debug output, will be replaced """ await (time + self.deletion_duration) await self._usedstorage.decrease(size=file.filesize) self.files.pop(file.filename) - async def add(self, file: RequestedFile, job_repr=None): + async def add(self, file: RequestedFile): """ Adds file to storage object transferring it through the storage object's connection. This should be sufficient for now because files are only added @@ -179,7 +178,6 @@ async def add(self, file: RequestedFile, job_repr=None): direct file placement this has to be adapted. 
:param file: representation of the file that is added to the storage - :param job_repr: Needed for debug output, will be replaced """ file = file.convert_to_stored_file_object(time.now) @@ -187,19 +185,18 @@ async def add(self, file: RequestedFile, job_repr=None): self.files[file.filename] = file await self.connection.transfer(file.filesize) - async def _update(self, stored_file: StoredFile, job_repr): + async def _update(self, stored_file: StoredFile): """ Updates a stored files information upon access. :param stored_file: - :param job_repr: Needed for debug output, will be replaced :return: """ await (time + self.update_duration) stored_file.lastaccessed = time.now stored_file.increment_accesses() - async def transfer(self, file: RequestedFile, job_repr=None): + async def transfer(self, file: RequestedFile): """ Manages file transfer via the storage elements connection and updates file information. If the file should have been deleted since it was originally @@ -211,23 +208,19 @@ async def transfer(self, file: RequestedFile, job_repr=None): await self.connection.transfer(file.filesize) try: # TODO: needs handling of KeyError - await self._update(self.files[file.filename], job_repr) + await self._update(self.files[file.filename]) except AttributeError: pass - def find(self, requested_file: RequestedFile, job_repr=None): + def find(self, file: RequestedFile): """ Searches storage object for the requested_file and sends result (amount of cached data, storage object) to the queue. - :param requested_file: - :param job_repr: Needed for debug output, will be replaced :return: (amount of cached data, storage object) """ try: - result = LookUpInformation( - self.files[requested_file.filename].filesize, self - ) + result = LookUpInformation(self.files[file.filename].filesize, self) except KeyError: result = LookUpInformation(0, self) return result @@ -277,7 +270,7 @@ def available(self): def used(self): return 0 - async def transfer(self, file: RequestedFile, job_repr=None): + async def transfer(self, file: RequestedFile): """ Every time a file is requested from this kind of storage, `_hitrate` percent of the file are found on and transferred from this storage. @@ -285,7 +278,6 @@ async def transfer(self, file: RequestedFile, job_repr=None): associated to the hitrate storage. 
:param file: - :param job_repr: """ async with Scope() as scope: logging.getLogger("implementation").warning( @@ -303,17 +295,17 @@ async def transfer(self, file: RequestedFile, job_repr=None): ) ) - def find(self, requested_file: RequestedFile, job_repr=None): - return LookUpInformation(requested_file.filesize, self) + def find(self, file: RequestedFile): + return LookUpInformation(file.filesize, self) - async def add(self, file: RequestedFile, job_repr=None): + async def add(self, file: RequestedFile): """ As files are not contained explicitly, no functionality to add files is needed """ pass - async def remove(self, file: StoredFile, job_repr=None): + async def remove(self, file: StoredFile): """ As files are not contained explicitly, no functionality to remove files is needed @@ -357,7 +349,7 @@ def available(self): def used(self): return 0 - async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): + async def transfer(self, file: RequestedFile_HitrateBased): if file.cachehitrate: await self.connection.transfer(total=file.filesize) await sampling_required.put(self.connection) @@ -366,27 +358,23 @@ async def transfer(self, file: RequestedFile_HitrateBased, job_repr=None): print("file is not cached but cache is file source, this should not occur") raise ValueError - def find(self, requested_file: RequestedFile_HitrateBased, job_repr=None): + def find(self, file: RequestedFile_HitrateBased): """ Returns the expectation value for the amount of data of this file that are cached. - :param requested_file: - :param job_repr: :return: result of the lookup """ - return LookUpInformation( - requested_file.filesize * requested_file.cachehitrate, self - ) + return LookUpInformation(file.filesize * file.cachehitrate, self) - async def add(self, file: RequestedFile, job_repr=None): + async def add(self, file: RequestedFile): """ As there is no explicit record of stored files, no functionality to add files is needed """ pass - async def remove(self, file: StoredFile, job_repr=None): + async def remove(self, file: StoredFile): """ As there is no explicit record of stored files, no functionality to remove files is needed diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index 32e3742..aeab496 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -35,7 +35,7 @@ def used(self) -> int: raise NotImplementedError @abc.abstractmethod - async def transfer(self, file: RequestedFile, job_repr): + async def transfer(self, file: RequestedFile): """ Transfer size of given file via the storages' connection and update file information. If the file was deleted since it was originally looked up @@ -46,7 +46,7 @@ async def transfer(self, file: RequestedFile, job_repr): raise NotImplementedError @abc.abstractmethod - async def add(self, file: RequestedFile, job_repr): + async def add(self, file: RequestedFile): """ Add file information to storage and transfer the size of the file via the storages' connection. @@ -54,13 +54,13 @@ async def add(self, file: RequestedFile, job_repr): raise NotImplementedError @abc.abstractmethod - async def remove(self, file: StoredFile, job_repr): + async def remove(self, file: StoredFile): """ Remove all file information and used file size from the storage. 
""" raise NotImplementedError @abc.abstractmethod - def find(self, file: RequestedFile, job_repr) -> LookUpInformation: + def find(self, file: RequestedFile) -> LookUpInformation: """Information if a file is stored in Storage""" raise NotImplementedError diff --git a/lapis/simulator.py b/lapis/simulator.py index 2cb7f2b..e5e9437 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -12,7 +12,7 @@ from lapis.drone import Drone from lapis.cachingjob import job_to_queue_scheduler from lapis.caching.connection import Connection -from lapis.monitor.caching import ( +from lapis.caching.monitor.caching import ( storage_status, pipe_status, hitrate_evaluation, diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index ace0d5b..07473d1 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -21,7 +21,7 @@ def test_hitratestorage(self): size = 1000 hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) requested_file = RequestedFile(filename="testfile", filesize=100) - looked_up_file = hitratestorage.find(requested_file, job_repr=None) + looked_up_file = hitratestorage.find(requested_file) assert size == hitratestorage.available assert 0 == hitratestorage.used diff --git a/lapis_tests/test_storage_filebasedhitrate.py b/lapis_tests/test_storage_filebasedhitrate.py index f15251c..153dabc 100644 --- a/lapis_tests/test_storage_filebasedhitrate.py +++ b/lapis_tests/test_storage_filebasedhitrate.py @@ -1,5 +1,5 @@ from lapis.caching.storageelement import FileBasedHitrateStorage -from lapis_tests import via_usim, DummyJob +from lapis_tests import via_usim from lapis.caching.files import RequestedFile_HitrateBased from lapis.caching.storageelement import LookUpInformation @@ -28,12 +28,12 @@ async def test_transfer(self): name="name", sitename="site", size=200, throughput_limit=1 ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) - await filebasedhitratestorage.transfer(requestedFile, DummyJob()) + await filebasedhitratestorage.transfer(requestedFile) assert time.now == 20 with pytest.raises(ValueError): requestedFile = RequestedFile_HitrateBased("filename", 20, 0) - await filebasedhitratestorage.transfer(requestedFile, DummyJob()) + await filebasedhitratestorage.transfer(requestedFile) def test_find_file_in_storage(self): filebasedhitratestorage = FileBasedHitrateStorage( From 79c1e49a393d0ced70528aac5b33f2929064bf2f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Fri, 11 Dec 2020 18:38:04 +0100 Subject: [PATCH 614/648] changed api of file classes --- lapis/caching/files.py | 17 +++++++---------- lapis/caching/storageelement.py | 5 ++--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/lapis/caching/files.py b/lapis/caching/files.py index bad5f45..5792dd0 100644 --- a/lapis/caching/files.py +++ b/lapis/caching/files.py @@ -38,22 +38,20 @@ def __init__( :param numberofaccesses: number of times the file was accessed """ self.filename = filename - """name of the file """ self.filesize = filesize - """size of the file""" self.storedsize = storedsize or self.filesize - """size of the file that is actually stored""" self.cachedsince = cachedsince - """point in time when the file was cached""" self.lastaccessed = lastaccessed - """time when the file was accessed the last time""" self.numberofaccesses = numberofaccesses - """number of times the file was accessed""" - def increment_accesses(self): + def access(self, access_time: int): """ - Increments number of accesses of a 
file + Tracks a new access to the file at time `access_time`, including + incrementing the access count. + + :param access_time: time when the file was accessed """ + self.lastaccessed = access_time self.numberofaccesses += 1 @@ -67,13 +65,12 @@ class RequestedFile(NamedTuple): filesize: Optional[int] = None """size of the file""" - def convert_to_stored_file_object(self, currenttime): + def to_stored_file(self, currenttime: int) -> StoredFile: """ Converts a requested file into a stored file :param currenttime: point in time when the conversion takes place """ - print(self.filesize) return StoredFile( self.filename, filesize=self.filesize, diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index a229124..5a2c675 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -180,7 +180,7 @@ async def add(self, file: RequestedFile): :param file: representation of the file that is added to the storage """ - file = file.convert_to_stored_file_object(time.now) + file = file.to_stored_file(time.now) await self._usedstorage.increase(size=file.filesize) self.files[file.filename] = file await self.connection.transfer(file.filesize) @@ -193,8 +193,7 @@ async def _update(self, stored_file: StoredFile): :return: """ await (time + self.update_duration) - stored_file.lastaccessed = time.now - stored_file.increment_accesses() + stored_file.access(access_time=time.now) async def transfer(self, file: RequestedFile): """ From cdce24dd072f9ee61f2f5d4efc157fea7fe9787c Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Mon, 14 Dec 2020 18:55:04 +0100 Subject: [PATCH 615/648] removed further usages of job_repr --- lapis/caching/connection.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 10aecbd..0938cd0 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -93,10 +93,7 @@ def add_storage_element(self, storage_element: StorageElement): self.storages[storage_element.sitename] = [storage_element] def _determine_inputfile_source( - self, - requested_file: RequestedFile, - dronesite: Optional[str], - job_repr: Optional[str] = None, + self, requested_file: RequestedFile, dronesite: Optional[str] ) -> Union[StorageElement, RemoteStorage]: """ Collects NamedTuples containing the amount of data of the requested file @@ -108,7 +105,6 @@ def _determine_inputfile_source( :param requested_file: :param dronesite: - :param job_repr: :return: pipe that will be used for file transfer """ provided_storages = self.storages.get(dronesite, None) @@ -125,9 +121,7 @@ def _determine_inputfile_source( return entry.storage return self.remote_connection - async def stream_file( - self, requested_file: RequestedFile, dronesite, job_repr=None - ): + async def stream_file(self, requested_file: RequestedFile, dronesite): """ Determines which storage object is used to provide the requested file and starts the files transfer. 
For files transferred via remote connection a @@ -136,11 +130,8 @@ async def stream_file( :param requested_file: :param dronesite: - :param job_repr: """ - used_connection = self._determine_inputfile_source( - requested_file, dronesite, job_repr - ) + used_connection = self._determine_inputfile_source(requested_file, dronesite) if self._filebased_caching: if used_connection == self.remote_connection and self.storages.get( dronesite, None @@ -152,11 +143,11 @@ async def stream_file( ) if cache_file: for file in files_for_deletion: - await potential_cache.remove(file, job_repr) - await potential_cache.add(requested_file, job_repr) + await potential_cache.remove(file) + await potential_cache.add(requested_file) else: print( - f"APPLY CACHING DECISION: CachingJob {job_repr}, " + f"APPLY CACHING DECISION: CachingJob, " f"File {requested_file.filename}: File wasnt " f"cached @ {time.now}" ) @@ -215,6 +206,6 @@ async def transfer_files(self, drone, requested_files: dict, job_repr): requested_file = RequestedFile( inputfilename, inputfilespecs["usedsize"] ) - await self.stream_file(requested_file, drone.sitename, job_repr) + await self.stream_file(requested_file, drone.sitename) stream_time = time.now - start_time return stream_time From 2accbcf90172373c2d7b4fa9624adaf1aa281107 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 15 Dec 2020 10:15:59 +0100 Subject: [PATCH 616/648] added dependency to current lapis branch --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dea816f..13a7f2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ packages = [ [tool.poetry.dependencies] python = "^3.6.1" -lapis-sim = "^0.4.1" +lapis-sim = { git = "https://github.com/MatterMiners/lapis.git", branch = "cleanup/caching" } classad = "^0.4.0" Sphinx = { version = "^3.3.1", optional = true } From e0032eff2b25a41de63cd0dfb5b957679df13088 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 16 Dec 2020 16:49:39 +0100 Subject: [PATCH 617/648] refactored monitor functionality --- custom_simulate.py | 3 +- lapis/caching/connection.py | 4 +- lapis/caching/monitor/caching.py | 172 --------------- lapis/caching/storageelement.py | 4 +- lapis/cachingjob.py | 2 +- lapis/cli/simulate.py | 7 +- lapis/drone.py | 14 +- lapis/monitor/__init__.py | 104 --------- lapis/monitor/{general.py => caching.py} | 263 +++++++++++------------ lapis/monitor/cobald.py | 80 ------- lapis/monitor/duplicates.py | 2 +- lapis/monitor/timefilter.py | 15 ++ lapis/scheduler.py | 2 +- lapis/simulator.py | 49 ++++- lapis_tests/utility/test_monitor.py | 3 +- 15 files changed, 206 insertions(+), 518 deletions(-) delete mode 100644 lapis/caching/monitor/caching.py delete mode 100644 lapis/monitor/__init__.py rename lapis/monitor/{general.py => caching.py} (55%) delete mode 100644 lapis/monitor/cobald.py create mode 100644 lapis/monitor/timefilter.py diff --git a/custom_simulate.py b/custom_simulate.py index a3e19d0..87bc043 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -22,7 +22,8 @@ from lapis.simulator import Simulator -from lapis.monitor import LoggingUDPSocketHandler, SimulationTimeFilter +from lapis.monitor.core import LoggingUDPSocketHandler +from lapis.monitor.timefilter import SimulationTimeFilter from time import time diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 0938cd0..9d0390f 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -13,8 +13,8 @@ ) from lapis.caching.storageelement 
import StorageElement, RemoteStorage from lapis.caching.files import RequestedFile, RequestedFile_HitrateBased -from lapis.monitor import sampling_required -from lapis.caching.monitor.caching import HitrateInfo +from lapis.monitor.core import sampling_required +from lapis.monitor.caching import HitrateInfo class Connection(object): diff --git a/lapis/caching/monitor/caching.py b/lapis/caching/monitor/caching.py deleted file mode 100644 index ce807ea..0000000 --- a/lapis/caching/monitor/caching.py +++ /dev/null @@ -1,172 +0,0 @@ -import logging - -from typing import NamedTuple - -from cobald.monitor.format_json import JsonFormatter -from cobald.monitor.format_line import LineProtocolFormatter - -from lapis.monitor import ( - LoggingSocketHandler, - LoggingUDPSocketHandler, - SIMULATION_START, -) -from lapis.caching.storageelement import StorageElement -from lapis.caching.monitoredpipe import MonitoredPipe, MonitoredPipeInfo - -import time as pytime - - -class HitrateInfo(NamedTuple): - hitrate: float - volume: float - provides_file: int - - -class SimulationInfo(NamedTuple): - input: list - identifier: str - - -def simulation_id(simulationinfo) -> list: - results = [ - { - "input": str(simulationinfo.input), - "id": simulationinfo.identifier, - "time": pytime.ctime(SIMULATION_START), - } - ] - return results - - -simulation_id.name = "simulation_id" -simulation_id.whitelist = (SimulationInfo,) -simulation_id.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1e-9 - ), -} - - -def hitrate_evaluation(hitrateinfo: HitrateInfo) -> list: - results = [ - { - "hitrate": hitrateinfo.hitrate, - "volume": hitrateinfo.volume / 1000.0 / 1000.0 / 1000.0, - "providesfile": hitrateinfo.provides_file, - } - ] - return results - - -hitrate_evaluation.name = "hitrate_evaluation" -hitrate_evaluation.whitelist = (HitrateInfo,) -hitrate_evaluation.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1e-9 - ), -} - - -def storage_status(storage: StorageElement) -> list: - """ - Log information about current storage object state - :param storage: - :return: list of records for logging - """ - results = [ - { - "storage": repr(storage), - "usedstorage": storage.used, - "storagesize": storage.size, - "numberoffiles": len(storage.files), - } - ] - return results - - -storage_status.name = "storage_status" -storage_status.whitelist = (StorageElement,) -storage_status.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "storage"}, resolution=1e-9 - ), -} - - -def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: - """ - # Log information about the pipes - # :param storage: - # :return: - # """ - results = [ - { - "pipe": repr(pipeinfo.pipename), - "throughput": pipeinfo.available_throughput / 1000.0 / 1000.0 / 1000.0, - "requested_throughput": 
pipeinfo.requested_throughput - / 1000.0 - / 1000.0 - / 1000.0, - "throughput_scale": pipeinfo.throughputscale, - "no_subscribers": pipeinfo.no_subscriptions, - } - ] - return results - - -pipe_status.name = "pipe_status" -pipe_status.whitelist = (MonitoredPipeInfo,) -pipe_status.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1e-9 - ), -} - - -def pipe_data_volume(pipe: MonitoredPipe): - """ - Total amount of data transferred by the pipe up to this point - :param pipe: - :return: - """ - results = [ - { - "pipe": repr(pipe), - "current_total": pipe.transferred_data / 1000.0 / 1000.0 / 1000.0, - } - ] - return results - - -pipe_data_volume.name = "pipe_data_volume" -pipe_data_volume.whitelist = (MonitoredPipe,) -pipe_data_volume.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pipe"}, resolution=1e-9 - ), -} diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index 5a2c675..cb06654 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -2,7 +2,7 @@ from usim import time, Resources, Scope from lapis.caching.monitoredpipe import MonitoredPipe -from lapis.monitor import sampling_required +from lapis.monitor.core import sampling_required from lapis.caching.files import StoredFile, RequestedFile, RequestedFile_HitrateBased from lapis.interfaces._storage import Storage, LookUpInformation @@ -67,7 +67,7 @@ async def remove(self, file: StoredFile, **kwargs): """ raise NotImplementedError - def find(self, file: RequestedFile) -> LookUpInformation: + def find(self, file: RequestedFile, **kwargs) -> LookUpInformation: """ All files are contained in remote storage. Therefore no functionality to determine whether the storage contains a certain file is provided. 
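Throughout this patch the monitoring imports move from `lapis.monitor` to `lapis.monitor.core`, while the statistic-callable contract documented in the (here deleted) `lapis/monitor/__init__.py` stays the same: a statistic exposes `name`, `whitelist` and `logging_formatter` attributes, and every object put onto the `sampling_required` queue is dispatched to the statistics registered for that object's type. The sketch below is a minimal, synchronous stand-in for that dispatch — a plain dictionary instead of the usim-based `Monitoring` loop; the `HitrateInfo` fields mirror the NamedTuple handled by this patch, everything else is illustrative only and not part of the lapis API.

.. code:: python3

    from typing import Callable, Dict, List, NamedTuple, Set, Type


    class HitrateInfo(NamedTuple):
        # fields as in lapis.monitor.caching shown in this patch
        hitrate: float
        volume: float
        provides_file: int


    _statistics: Dict[Type, Set[Callable]] = {}


    def register_statistic(statistic: Callable) -> None:
        # statistics announce the object types they handle via `whitelist`
        for element in statistic.whitelist:
            _statistics.setdefault(element, set()).add(statistic)


    def hitrate_evaluation(info: HitrateInfo) -> List[dict]:
        # a statistic converts a monitored object into a list of log records
        return [{"hitrate": info.hitrate, "volume": info.volume / 1e9}]


    hitrate_evaluation.name = "hitrate_evaluation"
    hitrate_evaluation.whitelist = (HitrateInfo,)

    register_statistic(hitrate_evaluation)

    # stand-in for one round of the Monitoring loop consuming `sampling_required`
    log_object = HitrateInfo(hitrate=0.5, volume=2e9, provides_file=1)
    for statistic in _statistics.get(type(log_object), ()):
        print(statistic.name, statistic(log_object))
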
diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index 0421273..59f1857 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -3,7 +3,7 @@ from usim import time, Scope, instant from usim import CancelTask -from lapis.monitor import sampling_required +from lapis.monitor.core import sampling_required from lapis.job import Job if TYPE_CHECKING: diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py index 3f9d216..ccbe1ac 100644 --- a/lapis/cli/simulate.py +++ b/lapis/cli/simulate.py @@ -17,11 +17,8 @@ from lapis.scheduler import CondorJobScheduler from lapis.simulator import Simulator -from lapis.monitor import ( - LoggingSocketHandler, - LoggingUDPSocketHandler, - SimulationTimeFilter, -) +from lapis.monitor.core import LoggingSocketHandler, LoggingUDPSocketHandler +from lapis.monitor.timefilter import SimulationTimeFilter last_step = 0 diff --git a/lapis/drone.py b/lapis/drone.py index 53c4d33..e063269 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,13 +1,15 @@ from cobald import interfaces from usim import time, Scope, instant, Capacities, ResourcesUnavailable, Queue -from typing import Optional +from typing import Optional, TYPE_CHECKING from lapis.cachingjob import CachingJob -from lapis.caching.connection import Connection from lapis.monitor.duplicates import DroneStatusCaching +if TYPE_CHECKING: + from lapis.caching.connection import Connection + class ResourcesExceeded(Exception): ... @@ -25,7 +27,7 @@ def __init__( scheduling_duration: Optional[float] = None, ignore_resources: list = None, sitename: str = None, - connection: Connection = None, + connection: "Connection" = None, empty: callable = lambda drone: drone.theoretical_available_resources.get( "cores", 1 ) @@ -127,7 +129,7 @@ async def run(self): job queue, these jobs are executed. Starting jobs via a job queue was introduced to avoid errors in resource allocation and monitoring. """ - from lapis.monitor import sampling_required + from lapis.monitor.core import sampling_required await (time + self.scheduling_duration) self._supply = 1 @@ -189,7 +191,7 @@ async def shutdown(self): """ Upon shutdown, the drone unregisters from the scheduler. """ - from lapis.monitor import sampling_required + from lapis.monitor.core import sampling_required self._supply = 0 self.scheduler.unregister_drone(self) @@ -232,7 +234,7 @@ async def _run_job(self, job: CachingJob, kill: bool): """ job.drone = self async with Scope() as scope: - from lapis.monitor import sampling_required + from lapis.monitor.core import sampling_required self._utilisation = self._allocation = None diff --git a/lapis/monitor/__init__.py b/lapis/monitor/__init__.py deleted file mode 100644 index 7e7b1c5..0000000 --- a/lapis/monitor/__init__.py +++ /dev/null @@ -1,104 +0,0 @@ -import copy -import logging -import logging.handlers -from typing import Callable - -from cobald.monitor.format_json import JsonFormatter -from usim import time, Queue -from usim._core.loop import __LOOP_STATE__ - - -SIMULATION_START = None - - -class LoggingSocketHandler(logging.handlers.SocketHandler): - def makePickle(self, record): - return self.format(record).encode() - - -class LoggingUDPSocketHandler(logging.handlers.DatagramHandler): - def makePickle(self, record): - return self.format(record).encode() - - -class SimulationTimeFilter(logging.Filter): - """ - Dummy filter to replace log record timestamp with simulation time. 
- """ - - def filter(self, record) -> bool: - # record.created = time.now - record.created = time.now + (1e-9 * __LOOP_STATE__.LOOP.turn) - return True - - -sampling_required = Queue() - - -class Monitoring(object): - """ - Enable monitoring of a simulation. Objects that change during simulation are - registered in a queue. Whenever objects in the queue become available, the - monitoring object takes care to dispatch the object to registered statistic - callables taking care to generate relevant monitoring output. - """ - - def __init__(self): - self._statistics = {} - - async def run(self): - # The Queue.__aiter__ cannot safely be finalised unless closed. - # We explicitly create and later on aclose it, to ensure this happens - # when the Scope collects us and the event loop is still around. - log_iter = sampling_required.__aiter__() - try: - async for log_object in log_iter: - for statistic in self._statistics.get(type(log_object), set()): - # do the logging - for record in statistic(log_object): - record["tardis"] = "lapis-%s" % SIMULATION_START - logging.getLogger(statistic.name).info(statistic.name, record) - except GeneratorExit: - await log_iter.aclose() - - def register_statistic(self, statistic: Callable) -> None: - """ - Register a callable that takes an object for logging and generates a list - of records. The callable should have the following accessible attributes: - - name: - The identifying name of the statistic for logging - logging_formatter: - Pre-defined formatters for the different supported logging formats - including socket, stream, and telegraf logging. - whitelist: - A tuple of objects the statistic callable is interested in to create - the required logging messages. - - :param statistic: Callable that returns a list of records for logging - """ - assert hasattr(statistic, "name") and hasattr(statistic, "logging_formatter") - try: - for element in statistic.whitelist: - self._statistics.setdefault(element, set()).add(statistic) - except AttributeError: - logging.getLogger("implementation").warning( - f"Removing statistic {statistic.name} as no whitelist has been defined." 
- ) - return - - # prepare the logger - logger = logging.getLogger(statistic.name) - if not logger.handlers: - logger.addFilter(SimulationTimeFilter()) - logger.propagate = False - # append handlers of default logger and add required formatters - root_logger = logging.getLogger() - for handler in root_logger.handlers: - new_handler = copy.copy(handler) - new_handler.setFormatter( - statistic.logging_formatter.get( - type(handler).__name__, JsonFormatter() - ) - ) - logger.addHandler(new_handler) diff --git a/lapis/monitor/general.py b/lapis/monitor/caching.py similarity index 55% rename from lapis/monitor/general.py rename to lapis/monitor/caching.py index 4fb297e..0c0dc7d 100644 --- a/lapis/monitor/general.py +++ b/lapis/monitor/caching.py @@ -1,141 +1,181 @@ -from typing import TYPE_CHECKING, List, Dict +import logging -import logging.handlers +from typing import NamedTuple, List, Dict from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter from lapis.drone import Drone +from lapis.monitor.core import ( + LoggingSocketHandler, + LoggingUDPSocketHandler, + SIMULATION_START, +) +from lapis.caching.storageelement import StorageElement +from lapis.caching.monitoredpipe import MonitoredPipe, MonitoredPipeInfo + +import time as pytime + from lapis.cachingjob import CachingJob -from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler -from lapis.pool import Pool -from lapis.scheduler import CondorJobScheduler, JobQueue -if TYPE_CHECKING: - from lapis.simulator import Simulator +class HitrateInfo(NamedTuple): + hitrate: float + volume: float + provides_file: int -def resource_statistics(drone: Drone) -> List[Dict]: - """ - Log ratio of used and requested resources for drones. - :param drone: the drone - :return: list of records for logging - """ - results = [] - resources = drone.theoretical_available_resources - used_resources = drone.available_resources - for resource_type in resources: - try: - results.append( - { - "resource_type": resource_type, - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "used_ratio": 1 - - used_resources[resource_type] - / drone.pool_resources[resource_type], - "requested_ratio": 1 - - resources[resource_type] / drone.pool_resources[resource_type], - } - ) - except ZeroDivisionError: - results.append( - { - "resource_type": resource_type, - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "used_ratio": 1, - "requested_ratio": 1, - } - ) +class SimulationInfo(NamedTuple): + input: list + identifier: str + + +def simulation_id(simulationinfo) -> list: + results = [ + { + "input": str(simulationinfo.input), + "id": simulationinfo.identifier, + "time": pytime.ctime(SIMULATION_START), + } + ] return results -resource_statistics.name = "resource_status" -resource_statistics.whitelist = (Drone,) -resource_statistics.logging_formatter = { +simulation_id.name = "simulation_id" +simulation_id.whitelist = (SimulationInfo,) +simulation_id.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, - resolution=1e-9, + tags={"tardis"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, - resolution=1e-9, + tags={"tardis"}, resolution=1e-9 ), } -def 
user_demand(job_queue: JobQueue) -> List[Dict]: - """ - Log global user demand. +def hitrate_evaluation(hitrateinfo: HitrateInfo) -> list: + results = [ + { + "hitrate": hitrateinfo.hitrate, + "volume": hitrateinfo.volume / 1000.0 / 1000.0 / 1000.0, + "providesfile": hitrateinfo.provides_file, + } + ] + return results + + +hitrate_evaluation.name = "hitrate_evaluation" +hitrate_evaluation.whitelist = (HitrateInfo,) +hitrate_evaluation.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1e-9 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis"}, resolution=1e-9 + ), +} - :param scheduler: the scheduler + +def storage_status(storage: StorageElement) -> list: + """ + Log information about current storage object state + :param storage: :return: list of records for logging """ - result = [{"value": len(job_queue)}] - return result + results = [ + { + "storage": repr(storage), + "usedstorage": storage.used, + "storagesize": storage.size, + "numberoffiles": len(storage.files), + } + ] + return results -user_demand.name = "user_demand" -user_demand.whitelist = (JobQueue,) -user_demand.logging_formatter = { +storage_status.name = "storage_status" +storage_status.whitelist = (StorageElement,) +storage_status.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1e-9 + tags={"tardis", "storage"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis"}, resolution=1e-9 + tags={"tardis", "storage"}, resolution=1e-9 ), } -def job_statistics(scheduler: CondorJobScheduler) -> List[Dict]: +def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: """ - Log number of jobs running in all drones. + # Log information about the pipes + # :param storage: + # :return: + # """ + results = [ + { + "pipe": repr(pipeinfo.pipename), + "throughput": pipeinfo.available_throughput / 1000.0 / 1000.0 / 1000.0, + "requested_throughput": pipeinfo.requested_throughput + / 1000.0 + / 1000.0 + / 1000.0, + "throughput_scale": pipeinfo.throughputscale, + "no_subscribers": pipeinfo.no_subscriptions, + } + ] + return results + - .. Note:: +pipe_status.name = "pipe_status" +pipe_status.whitelist = (MonitoredPipeInfo,) +pipe_status.logging_formatter = { + LoggingSocketHandler.__name__: JsonFormatter(), + # logging.StreamHandler.__name__: JsonFormatter(), + logging.StreamHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pipe"}, resolution=1e-9 + ), + LoggingUDPSocketHandler.__name__: LineProtocolFormatter( + tags={"tardis", "pipe"}, resolution=1e-9 + ), +} - The logging is currently synchronised with the frequency of the - scheduler. If a finer resolution is required, the update of drones - can be considered additionally. 
- :param scheduler: the scheduler - :return: list of records for logging +def pipe_data_volume(pipe: MonitoredPipe): + """ + Total amount of data transferred by the pipe up to this point + :param pipe: + :return: """ - result = 0 - for drone in scheduler.drone_list: - result += drone.jobs - return [ + results = [ { - "pool_configuration": "None", - "pool_type": "obs", - "pool": repr(scheduler), - "job_count": result, + "pipe": repr(pipe), + "current_total": pipe.transferred_data / 1000.0 / 1000.0 / 1000.0, } ] + return results -job_statistics.name = "cobald_status" -job_statistics.whitelist = (CondorJobScheduler,) -job_statistics.logging_formatter = { +pipe_data_volume.name = "pipe_data_volume" +pipe_data_volume.whitelist = (MonitoredPipe,) +pipe_data_volume.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 + tags={"tardis", "pipe"}, resolution=1e-9 ), LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 + tags={"tardis", "pipe"}, resolution=1e-9 ), } -def job_events(job: CachingJob) -> List[Dict]: +def extended_job_events(job: CachingJob) -> List[Dict]: """ Log relevant events for jobs. Relevant events are @@ -202,11 +242,10 @@ def job_events(job: CachingJob) -> List[Dict]: return [result] -job_events.name = "job_event" -job_events.whitelist = (CachingJob,) -job_events.logging_formatter = { +extended_job_events.name = "job_event" +extended_job_events.whitelist = (CachingJob,) +extended_job_events.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_configuration", "pool_type", "pool", "job", "cached"}, resolution=1e-9, @@ -218,56 +257,7 @@ def job_events(job: CachingJob) -> List[Dict]: } -def pool_status(pool: Pool) -> List[Dict]: - """ - Log state changes of pools and drones. - - :param simulator: the simulator - :return: list of records for logging - """ - return [] - - -pool_status.name = "pool_status" -pool_status.whitelist = (Pool,) -pool_status.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, - resolution=1e-9, - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, - resolution=1e-9, - ), -} - - -def configuration_information(simulator: "Simulator") -> List[Dict]: - """ - Log information how pools and drones are configured, e.g. provided resources. 
- - :param simulator: the simulator - :return: list of records for logging - """ - return [] - - -configuration_information.name = "configuration" -configuration_information.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "resource_type"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "resource_type"}, resolution=1e-9 - ), -} - - -def drone_statistics_caching(drone: Drone) -> List[Dict]: +def drone_statistics_caching(drone: "Drone") -> List[Dict]: """ @@ -293,7 +283,6 @@ def drone_statistics_caching(drone: Drone) -> List[Dict]: drone_statistics_caching.whitelist = (Drone,) drone_statistics_caching.logging_formatter = { LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), logging.StreamHandler.__name__: LineProtocolFormatter( tags={"tardis", "pool_type", "pool"}, resolution=1e-9 ), diff --git a/lapis/monitor/cobald.py b/lapis/monitor/cobald.py deleted file mode 100644 index 1380651..0000000 --- a/lapis/monitor/cobald.py +++ /dev/null @@ -1,80 +0,0 @@ -import logging - -from cobald.monitor.format_json import JsonFormatter -from cobald.monitor.format_line import LineProtocolFormatter -from typing import List, Dict - -from lapis.drone import Drone -from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler -from lapis.pool import Pool - - -def drone_statistics(drone: Drone) -> List[Dict]: - """ - Collect allocation, utilisation, demand and supply of drones. - - :param drone: the drone - :return: list of records for logging - """ - results = [ - { - "pool_configuration": "None", - "pool_type": "drone", - "pool": repr(drone), - "allocation": drone.allocation, - "utilisation": drone.utilisation, - "demand": drone.demand, - "supply": drone.supply, - "job_count": drone.jobs, - } - ] - return results - - -drone_statistics.name = "cobald_status" -drone_statistics.whitelist = (Drone,) -drone_statistics.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 - ), -} - - -def pool_statistics(pool: Pool) -> List[Dict]: - """ - Collect allocation, utilisation, demand and supply of pools. 
- - :param pool: the pool - :return: list of records to log - """ - results = [ - { - "pool_configuration": "None", - "pool_type": "pool", - "pool": repr(pool), - "allocation": pool.allocation, - "utilisation": pool.utilisation, - "demand": pool.demand, - "supply": pool.supply, - } - ] - return results - - -pool_statistics.name = "cobald_status" -pool_statistics.whitelist = (Pool,) -pool_statistics.logging_formatter = { - LoggingSocketHandler.__name__: JsonFormatter(), - # logging.StreamHandler.__name__: JsonFormatter(), - logging.StreamHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 - ), - LoggingUDPSocketHandler.__name__: LineProtocolFormatter( - tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 - ), -} diff --git a/lapis/monitor/duplicates.py b/lapis/monitor/duplicates.py index 25de57a..ec1823d 100644 --- a/lapis/monitor/duplicates.py +++ b/lapis/monitor/duplicates.py @@ -1,6 +1,6 @@ import logging.handlers from typing import NamedTuple, List, Dict -from lapis.monitor import LoggingSocketHandler, LoggingUDPSocketHandler +from lapis.monitor.core import LoggingSocketHandler, LoggingUDPSocketHandler from cobald.monitor.format_json import JsonFormatter from cobald.monitor.format_line import LineProtocolFormatter diff --git a/lapis/monitor/timefilter.py b/lapis/monitor/timefilter.py new file mode 100644 index 0000000..b095ff1 --- /dev/null +++ b/lapis/monitor/timefilter.py @@ -0,0 +1,15 @@ +import logging + +from usim import time +from usim._core.loop import __LOOP_STATE__ + + +class SimulationTimeFilter(logging.Filter): + """ + Dummy filter to replace log record timestamp with simulation time. + """ + + def filter(self, record) -> bool: + # record.created = time.now + record.created = time.now + (1e-9 * __LOOP_STATE__.LOOP.turn) + return True diff --git a/lapis/scheduler.py b/lapis/scheduler.py index 405b2fe..da92504 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -15,7 +15,7 @@ from lapis.drone import Drone from lapis.cachingjob import CachingJob -from lapis.monitor import sampling_required +from lapis.monitor.core import sampling_required from lapis.monitor.duplicates import UserDemand from usim import time diff --git a/lapis/simulator.py b/lapis/simulator.py index e5e9437..368bc6c 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -6,18 +6,21 @@ from typing import List from cobald.interfaces import Controller +from cobald.monitor.format_line import LineProtocolFormatter from usim import run, time, until, Scope, Queue -import lapis.monitor as monitor +import lapis.monitor.core as monitor from lapis.drone import Drone from lapis.cachingjob import job_to_queue_scheduler from lapis.caching.connection import Connection -from lapis.caching.monitor.caching import ( +from lapis.monitor.caching import ( storage_status, pipe_status, hitrate_evaluation, simulation_id, pipe_data_volume, + extended_job_events, + drone_statistics_caching, ) from lapis.monitor.general import ( user_demand, @@ -25,8 +28,6 @@ resource_statistics, pool_status, configuration_information, - job_events, - drone_statistics_caching, ) from lapis.monitor.duplicates import user_demand_tmp, drone_statistics_caching_tmp from lapis.monitor.cobald import drone_statistics, pool_statistics @@ -54,14 +55,52 @@ def __init__(self, seed=1234): self.duration = None def enable_monitoring(self): + # configure for caching-specific monitoring + user_demand.logging_formatter[ + logging.StreamHandler.__name__ + ] = 
LineProtocolFormatter(tags={"tardis"}, resolution=1e-9) + job_statistics.logging_formatter[ + logging.StreamHandler.__name__ + ] = LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 + ) + pool_statistics.logging_formatter[ + logging.StreamHandler.__name__ + ] = LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 + ) + drone_statistics.logging_formatter[ + logging.StreamHandler.__name__ + ] = LineProtocolFormatter( + tags={"tardis", "pool_configuration", "pool_type", "pool"}, resolution=1e-9 + ) + resource_statistics.logging_formatter[ + logging.StreamHandler.__name__ + ] = LineProtocolFormatter( + tags={"tardis", "resource_type", "pool_configuration", "pool_type", "pool"}, + resolution=1e-9, + ) + pool_status.logging_formatter[ + logging.StreamHandler.__name__ + ] = LineProtocolFormatter( + tags={"tardis", "parent_pool", "pool_configuration", "pool_type", "pool"}, + resolution=1e-9, + ) + configuration_information.logging_formatter[ + logging.StreamHandler.__name__ + ] = LineProtocolFormatter( + tags={"tardis", "pool_configuration", "resource_type"}, resolution=1e-9 + ) + self.monitoring.register_statistic(user_demand) self.monitoring.register_statistic(job_statistics) - self.monitoring.register_statistic(job_events) self.monitoring.register_statistic(pool_statistics) self.monitoring.register_statistic(drone_statistics) self.monitoring.register_statistic(resource_statistics) self.monitoring.register_statistic(pool_status) self.monitoring.register_statistic(configuration_information) + # caching related statistics + self.monitoring.register_statistic(extended_job_events) self.monitoring.register_statistic(storage_status) self.monitoring.register_statistic(pipe_status) self.monitoring.register_statistic(drone_statistics_caching) diff --git a/lapis_tests/utility/test_monitor.py b/lapis_tests/utility/test_monitor.py index af009e8..2ef9386 100644 --- a/lapis_tests/utility/test_monitor.py +++ b/lapis_tests/utility/test_monitor.py @@ -10,7 +10,8 @@ from . import make_test_logger from lapis.monitor.general import resource_statistics -from lapis.monitor import SimulationTimeFilter, Monitoring +from lapis.monitor.core import Monitoring +from lapis.monitor.timefilter import SimulationTimeFilter def parse_line_protocol(literal: str): From c3a5436934e9fe863862a910e7439941fc446bcf Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 16 Dec 2020 21:19:26 +0100 Subject: [PATCH 618/648] deleted pool and optimised usage of connection --- lapis/pool.py | 164 -------------------------------------- lapis/pool_io/htcondor.py | 5 +- 2 files changed, 1 insertion(+), 168 deletions(-) delete mode 100644 lapis/pool.py diff --git a/lapis/pool.py b/lapis/pool.py deleted file mode 100644 index ce95191..0000000 --- a/lapis/pool.py +++ /dev/null @@ -1,164 +0,0 @@ -from functools import partial -from typing import Generator, Callable -from cobald import interfaces -from usim import eternity, Scope, interval - -from lapis.caching.connection import Connection -from .drone import Drone - - -class Pool(interfaces.Pool): - """ - A pool encapsulating a number of pools or drones. Given a specific demand, - allocation and utilisation, the pool is able to adapt in terms of number of - drones providing the given resources. 
- - :param capacity: Maximum number of pools that can be instantiated within the pool - :param init: Number of pools to instantiate at creation time of the pool - :param name: Name of the pool - :param make_drone: Callable to create a drone with specific properties for this pool - """ - - def __init__( - self, - make_drone: Callable, - *, - capacity: int = float("inf"), - init: int = 0, - name: str = None, - connection: Connection = None, - ): - super(Pool, self).__init__() - assert init <= capacity - self._drones = [] - self._demand = 1 - self._level = init - self._capacity = capacity - self._name = name - # TODO: Should drones have access to the pool or the connection directly? - if connection is not None: - self.make_drone = partial(make_drone, connection=connection) - else: - self.make_drone = make_drone - - async def init_pool(self, scope: Scope, init: int = 0): - """ - Initialisation of existing drones at creation time of pool. - - :param init: Number of drones to create. - """ - for _ in range(init): - drone = self.make_drone(0) - scope.do(drone.run()) - self._drones.append(drone) - - # TODO: the run method currently needs to be called manually - async def run(self): - """ - Pool periodically checks the current demand and provided drones. - If demand is higher than the current level, the pool takes care of - initialising new drones. Otherwise drones get removed. - """ - async with Scope() as scope: - await self.init_pool(scope=scope, init=self._level) - async for _ in interval(1): - drones_required = min(self._demand, self._capacity) - self._level - while drones_required > 0: - drones_required -= 1 - # start a new drone - drone = self.make_drone(10) - scope.do(drone.run()) - self._drones.append(drone) - self._level += 1 - if drones_required < 0: - for drone in self.drones: - if drone.jobs == 0: - drones_required += 1 - self._level -= 1 - self._drones.remove(drone) - scope.do(drone.shutdown()) - if drones_required == 0: - break - - @property - def drones(self) -> Generator[Drone, None, None]: - for drone in self._drones: - if drone.supply > 0: - yield drone - - def drone_demand(self) -> int: - return len(self._drones) - - @property - def allocation(self) -> float: - allocations = [] - for drone in self._drones: - allocations.append(drone.allocation) - try: - return sum(allocations) / len(allocations) - except ZeroDivisionError: - return 1 - - @property - def utilisation(self) -> float: - utilisations = [] - for drone in self._drones: - utilisations.append(drone.utilisation) - try: - return sum(utilisations) / len(utilisations) - except ZeroDivisionError: - return 1 - - @property - def supply(self) -> float: - supply = 0 - for drone in self._drones: - supply += drone.supply - return supply - - @property - def demand(self) -> float: - return self._demand - - @demand.setter - def demand(self, value: float): - if value > 0: - self._demand = value - else: - self._demand = 0 - - def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) - - -class StaticPool(Pool): - """ - A static pool does not react on changing conditions regarding demand, - allocation and utilisation but instead initialises the `capacity` of given - drones with initialised `resources`. 
- - :param capacity: Maximum number of pools that can be instantiated within - the pool - :param resources: Dictionary of resources available for each pool - instantiated within the pool - """ - - def __init__( - self, make_drone: Callable, capacity: int = 0, connection: Connection = None - ): - assert capacity > 0, "Static pool was initialised without any resources..." - super(StaticPool, self).__init__( - capacity=capacity, - init=capacity, - make_drone=make_drone, - connection=connection, - ) - self._demand = capacity - - async def run(self): - """ - Pool runs forever and does not check if number of drones needs to be adapted. - """ - async with Scope() as scope: - await self.init_pool(scope=scope, init=self._level) - await eternity diff --git a/lapis/pool_io/htcondor.py b/lapis/pool_io/htcondor.py index 6fb3222..a1e25c9 100644 --- a/lapis/pool_io/htcondor.py +++ b/lapis/pool_io/htcondor.py @@ -3,8 +3,7 @@ from typing import Callable -from lapis.caching.connection import Connection -from ..pool import Pool +from lapis.pool import Pool def htcondor_pool_reader( @@ -21,7 +20,6 @@ def htcondor_pool_reader( }, pool_type: Callable = Pool, make_drone: Callable = None, - connection: Connection = None, ): """ Load a pool configuration that was exported via htcondor from files or @@ -53,5 +51,4 @@ def htcondor_pool_reader( ignore_resources=["disk"], sitename=row.get("sitename", None), ), - connection=connection, ) From 884e17f5c39493f2f92ba7ac221456135d8294da Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 16 Dec 2020 21:23:13 +0100 Subject: [PATCH 619/648] removed machines from pool_io --- lapis/pool_io/machines.py | 45 --------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 lapis/pool_io/machines.py diff --git a/lapis/pool_io/machines.py b/lapis/pool_io/machines.py deleted file mode 100644 index 38e0e94..0000000 --- a/lapis/pool_io/machines.py +++ /dev/null @@ -1,45 +0,0 @@ -import csv -from functools import partial - -from typing import Callable -from ..pool import Pool - - -def machines_pool_reader( - iterable, - resource_name_mapping: dict = { # noqa: B006 - "cores": "CPUs_per_node", - "memory": "RAM_per_node_in_KB", - }, - unit_conversion_mapping={ # noqa: B006 - "CPUs_per_node": 1, - "RAM_per_node_in_KB": 1000, - }, - pool_type: Callable = Pool, - make_drone: Callable = None, -): - """ - Load a pool configuration that was exported via htcondor from files or - iterables - - :param make_drone: The callable to create the drone - :param iterable: an iterable yielding lines of CSV, such as an open file - :param resource_name_mapping: Mapping from given header names to well-defined - resources in simulation - :param pool_type: The type of pool to be yielded - :return: Yields the :py:class:`StaticPool`s found in the given iterable - """ - assert make_drone - reader = csv.DictReader(iterable, delimiter=" ", skipinitialspace=True) - for row in reader: - yield pool_type( - capacity=int(row["number_of_nodes"]), - make_drone=partial( - make_drone, - { - key: int(float(row[value]) * unit_conversion_mapping.get(value, 1)) - for key, value in resource_name_mapping.items() - }, - ), - name=row["cluster_name"], - ) From 41409832dfa9e026994f605dea347aac7d80cc10 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Wed, 16 Dec 2020 21:29:04 +0100 Subject: [PATCH 620/648] removed controller and cost --- lapis/controller.py | 71 --------------------------------------------- lapis/cost.py | 22 -------------- 2 files changed, 93 deletions(-) delete mode 100644 
lapis/controller.py delete mode 100644 lapis/cost.py diff --git a/lapis/controller.py b/lapis/controller.py deleted file mode 100644 index 9a44b46..0000000 --- a/lapis/controller.py +++ /dev/null @@ -1,71 +0,0 @@ -from cobald.controller.linear import LinearController -from cobald.controller.relative_supply import RelativeSupplyController -from cobald.interfaces import Pool -from usim import time - - -class SimulatedLinearController(LinearController): - def __init__( - self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1 - ): - super(SimulatedLinearController, self).__init__( - target, low_utilisation, high_allocation, rate, interval - ) - - async def run(self): - while True: - self.regulate(interval=self.interval) - await (time + self.interval) - - -class SimulatedRelativeSupplyController(RelativeSupplyController): - def __init__( - self, - target: Pool, - low_utilisation=0.5, - high_allocation=0.5, - low_scale=0.9, - high_scale=1.1, - interval=1, - ): - super(SimulatedRelativeSupplyController, self).__init__( - target=target, - low_utilisation=low_utilisation, - high_allocation=high_allocation, - low_scale=low_scale, - high_scale=high_scale, - interval=interval, - ) - - async def run(self): - while True: - self.regulate(interval=self.interval) - await (time + self.interval) - - -class SimulatedCostController(SimulatedLinearController): - def __init__( - self, target: Pool, low_utilisation=0.5, high_allocation=0.5, rate=1, interval=1 - ): - self.current_cost = 1 - super(SimulatedCostController, self).__init__( - target, low_utilisation, high_allocation, rate, interval - ) - - def regulate(self, interval): - allocation = 0 - for drone in self.target.drones: - allocation += drone.allocation - if self.target.supply - allocation <= 1: - if self.target.utilisation >= 0.8: - self.target.demand = int(allocation + self.current_cost) - self.current_cost += 1 - else: - self.target.demand = allocation - if self.current_cost > 1: - self.current_cost -= 1 - # self.target.demand = allocation + self.current_cost - # else: - # if self.current_cost > 1: - # self.current_cost -= 1 - # self.target.demand = allocation + self.current_cost diff --git a/lapis/cost.py b/lapis/cost.py deleted file mode 100644 index f391ff7..0000000 --- a/lapis/cost.py +++ /dev/null @@ -1,22 +0,0 @@ -def cobald_cost(simulator): - result = len(list(simulator.job_scheduler.drone_list)) - for drone in simulator.job_scheduler.drone_list: - result += 1 - tmp = 0 - for resource_key in drone.pool_resources: - tmp += drone.resources[resource_key] / drone.pool_resources[resource_key] - tmp /= len(drone.pool_resources) - result -= tmp - return result - - -def local_cobald_cost(pool): - result = 0 - for drone in pool.drones: - result += 1 - tmp = 0 - for resource_key in pool.resources: - tmp += drone.resources[resource_key] / pool.resources[resource_key] - tmp /= len(pool.resources) - result -= tmp - return result From fbbbb407c5f6d7ea6fe57963d48efa0a1fc24e24 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 27 Apr 2021 14:31:46 +0200 Subject: [PATCH 621/648] adapted documentation --- lapis/caching/monitoredpipe.py | 2 ++ lapis/cachingjob.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lapis/caching/monitoredpipe.py b/lapis/caching/monitoredpipe.py index 12e5531..91f5585 100644 --- a/lapis/caching/monitoredpipe.py +++ b/lapis/caching/monitoredpipe.py @@ -29,7 +29,9 @@ def __init__(self, throughput: float): async def load(self) -> AsyncIterable[MonitoredPipeInfo]: """ Monitor any changes 
of the throughput load of the pipe + .. code:: python3 + async def report_load(pipe: MonitoredPipe): async for event in pipe.load(): print( diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index 59f1857..92c1aa1 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -115,9 +115,7 @@ def __init__( can be < 1.0 to account for programmatical insufficiencies """ self.resources = resources - """dict containing resources requested by the job""" self.used_resources = used_resources - """dict containing resources actually used by the job""" for key in used_resources: if key not in resources: From 820583da6b109cb8aa9dabebe112a9db3de9d3c7 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Tue, 27 Apr 2021 14:32:56 +0200 Subject: [PATCH 622/648] connection is included as partial now --- lapis/simulator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapis/simulator.py b/lapis/simulator.py index 368bc6c..4c64984 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -118,8 +118,7 @@ def create_pools(self, pool_input, pool_reader, pool_type, controller=None): for pool in pool_reader( iterable=pool_input, pool_type=pool_type, - make_drone=partial(Drone, self.job_scheduler), - connection=self.connection, + make_drone=partial(Drone, self.job_scheduler, connection=self.connection), ): self.pools.append(pool) if controller: From 2ba5666b446d5b172912bb045fe51c2cd9b09afe Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 29 Apr 2021 15:05:19 +0200 Subject: [PATCH 623/648] fixed black issue --- lapis/monitor/caching.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/monitor/caching.py b/lapis/monitor/caching.py index 0c0dc7d..6d3cf1e 100644 --- a/lapis/monitor/caching.py +++ b/lapis/monitor/caching.py @@ -113,10 +113,10 @@ def storage_status(storage: StorageElement) -> list: def pipe_status(pipeinfo: MonitoredPipeInfo) -> list: """ - # Log information about the pipes - # :param storage: - # :return: - # """ + Log information about the pipes + :param storage: + :return: + """ results = [ { "pipe": repr(pipeinfo.pipename), From 48150fbd4498a1d2936837a8acfcc7df43d45e77 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 29 Apr 2021 15:02:27 +0200 Subject: [PATCH 624/648] removed unnecessary implementations already available in superclass --- lapis/cachingjob.py | 82 +-------------------------------------------- lapis/simulator.py | 2 +- 2 files changed, 2 insertions(+), 82 deletions(-) diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index 92c1aa1..24da283 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -1,4 +1,3 @@ -import logging from typing import Optional, TYPE_CHECKING from usim import time, Scope, instant from usim import CancelTask @@ -63,18 +62,8 @@ class CachingJob(Job): """ __slots__ = ( - "resources", - "used_resources", - "walltime", - "requested_walltime", - "queue_date", "requested_inputfiles", "used_inputfiles", - "in_queue_since", - "in_queue_until", - "_name", - "drone", - "_success", "calculation_efficiency", "__weakref__", "_read_from_cache", @@ -96,7 +85,6 @@ def __init__( in_queue_since: float = 0, queue_date: float = 0, name: Optional[str] = None, - drone: "Optional[Drone]" = None, calculation_efficiency: Optional[float] = 1.0, ): """ @@ -114,34 +102,8 @@ def __init__( :param calculation_efficiency: efficiency of the job's calculations, can be < 1.0 to account for programmatical insufficiencies """ - self.resources = resources - self.used_resources = used_resources + super().__init__(resources, 
used_resources, in_queue_since, queue_date, name) - for key in used_resources: - if key not in resources: - logging.getLogger("implementation").info( - "job uses different resources than specified, added %s: %s", - key, - self.used_resources[key], - ) - self.resources[key] = self.used_resources[key] - self.walltime: int = used_resources.pop("walltime") - """the job's runtime, in reality as well as in the simulation""" - self.requested_walltime: Optional[int] = resources.pop("walltime", None) - """estimate of the job's walltime""" - self.queue_date = queue_date - """ point in time when the job was submitted to the simulated job queue""" - assert in_queue_since >= 0, "Queue time cannot be negative" - self.in_queue_since = in_queue_since - """Time when job was inserted into the queue of the simulation scheduler""" - self.in_queue_until: Optional[float] = None - """point in time when the job left the job queue""" - self.drone = drone - """drone the job is executed on""" - self._name = name - """identifier of the job""" - self._success: Optional[bool] = None - """flag indicating whether the job was completed successfully""" self.calculation_efficiency = calculation_efficiency """efficiency of the job's calculations, can be < 1.0 to account for programmatical insufficiencies""" @@ -205,26 +167,6 @@ def __init__( """number of times the job entered the matchmaking process but was not scheduled to a drone""" - @property - def name(self) -> str: - return self._name or id(self) - - @property - def successful(self) -> Optional[bool]: - return self._success - - @property - def waiting_time(self) -> float: - """ - Determines the time the job spent in the simulated scheduling queue. `Inf` when - the job is still waiting. - - :return: Time in queue - """ - if self.in_queue_until is not None: - return self.in_queue_until - self.in_queue_since - return float("Inf") - async def _calculate(self): """ Determines a jobs calculation time based on the jobs CPU time and a @@ -305,25 +247,3 @@ async def run(self, drone: "Drone"): self.walltime = time.now - start self._success = True await sampling_required.put(self) - - def __repr__(self): - return "<%s: %s>" % (self.__class__.__name__, self._name or id(self)) - - -async def job_to_queue_scheduler(job_generator, job_queue): - """ - Handles reading the simulation's job input and puts the job's into the job queue - - :param job_generator: reader object that yields jobs from input - :param job_queue: queue the jobs are added to - """ - base_date = None - for job in job_generator: - if base_date is None: - base_date = job.queue_date - current_time = job.queue_date - base_date - if time.now < current_time: - await (time >= current_time) - job.in_queue_since = time.now - await job_queue.put(job) - await job_queue.close() diff --git a/lapis/simulator.py b/lapis/simulator.py index 4c64984..a89b042 100644 --- a/lapis/simulator.py +++ b/lapis/simulator.py @@ -11,7 +11,7 @@ import lapis.monitor.core as monitor from lapis.drone import Drone -from lapis.cachingjob import job_to_queue_scheduler +from lapis.job import job_to_queue_scheduler from lapis.caching.connection import Connection from lapis.monitor.caching import ( storage_status, From 28c92968c76c1e3054f745f296049ccd151fc31b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 29 Apr 2021 15:17:10 +0200 Subject: [PATCH 625/648] renamed requested_inputfiles to inputfiles --- lapis/cachingjob.py | 6 +++--- lapis/drone.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lapis/cachingjob.py 
b/lapis/cachingjob.py index 24da283..3671f08 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -62,7 +62,7 @@ class CachingJob(Job): """ __slots__ = ( - "requested_inputfiles", + "inputfiles", "used_inputfiles", "calculation_efficiency", "__weakref__", @@ -108,7 +108,7 @@ def __init__( """efficiency of the job's calculations, can be < 1.0 to account for programmatical insufficiencies""" # caching-related - self.requested_inputfiles = resources.pop("inputfiles", None) + self.inputfiles = resources.pop("inputfiles", None) """dict of input files requested by the job and respective file sizes""" self.used_inputfiles = used_resources.pop("inputfiles", None) """dict of input files read by the job and respective amount of read data""" @@ -182,7 +182,7 @@ async def _calculate(self): result = self.walltime try: if ( - not self.requested_inputfiles + not self.inputfiles or self.drone.connection.remote_connection.connection.throughput == float("Inf") ): diff --git a/lapis/drone.py b/lapis/drone.py index e063269..0531968 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -307,13 +307,13 @@ def look_up_cached_data(self, job: CachingJob): cached_data = 0 caches = self.connection.storages.get(self.sitename, None) if caches: - if job.requested_inputfiles: + if job.inputfiles: cached_data = sum( [ filespecs["hitrates"].get(cache.sitename, 0) * filespecs["filesize"] for cache in caches - for filespecs in job.requested_inputfiles.values() + for filespecs in job.inputfiles.values() ] ) self.cached_data = cached_data From 48af6595d2907b01a50d2417db291c62bc76e7a2 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 29 Apr 2021 15:42:52 +0200 Subject: [PATCH 626/648] refactored calculate in job into property calculation_time --- lapis/cachingjob.py | 11 ++++------- lapis_tests/test_job_caching.py | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index 3671f08..beb4927 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -70,7 +70,6 @@ class CachingJob(Job): "_cached_data", "_total_input_data", "_original_walltime", - "_calculation_time", "_transfer_time", "failed_matches", "cputime", @@ -119,8 +118,6 @@ def __init__( this job""" self._original_walltime = self.walltime """stores the jobs original walltime as a reference""" - self._calculation_time = 0 - """time the job takes only to perform all calculations""" self._transfer_time = 0 """time the job takes only to transfer all input data""" @@ -167,7 +164,8 @@ def __init__( """number of times the job entered the matchmaking process but was not scheduled to a drone""" - async def _calculate(self): + @property + def _calculation_time(self): """ Determines a jobs calculation time based on the jobs CPU time and a calculation efficiency representing inefficient programming. 
@@ -190,11 +188,10 @@ async def _calculate(self): result = ( self.used_resources["cores"] / self.calculation_efficiency ) * self.walltime - self._calculation_time = result except (KeyError, TypeError): pass - await (time + result) + return result async def _transfer_inputfiles(self): start = time.now @@ -230,7 +227,7 @@ async def run(self, drone: "Drone"): async with Scope() as scope: await instant scope.do(self._transfer_inputfiles()) - scope.do(self._calculate()) + await (time + self._calculation_time) except CancelTask: print("CancelTask") # self.drone = None diff --git a/lapis_tests/test_job_caching.py b/lapis_tests/test_job_caching.py index 04e59e1..90c92ce 100644 --- a/lapis_tests/test_job_caching.py +++ b/lapis_tests/test_job_caching.py @@ -13,7 +13,7 @@ async def test_calculation_time(self): ) self.job.drone = DummyDrone(1) starttime = time.now - await self.job._calculate() + await (time + self.job._calculation_time) assert time.now - starttime == 10 self.job = CachingJob( @@ -22,7 +22,7 @@ async def test_calculation_time(self): ) self.job.drone = DummyDrone(1) starttime = time.now - await self.job._calculate() + await (time + self.job._calculation_time) assert time.now - starttime == 7 self.job = CachingJob( @@ -32,7 +32,7 @@ async def test_calculation_time(self): ) self.job.drone = DummyDrone(1) starttime = time.now - await self.job._calculate() + await (time + self.job._calculation_time) assert time.now - starttime == 14 self.job = CachingJob( @@ -42,7 +42,7 @@ async def test_calculation_time(self): ) self.job.drone = DummyDrone(1) starttime = time.now - await self.job._calculate() + await (time + self.job._calculation_time) assert time.now - starttime == 10 @via_usim From 2d7e5f92da71c7044d4c04f85ed363e6c354465e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 29 Apr 2021 21:01:03 +0200 Subject: [PATCH 627/648] adapted implementation of drone and removed unnecessary implementations --- lapis/drone.py | 98 ++++---------------------------------------------- 1 file changed, 7 insertions(+), 91 deletions(-) diff --git a/lapis/drone.py b/lapis/drone.py index 0531968..25be5bb 100644 --- a/lapis/drone.py +++ b/lapis/drone.py @@ -1,6 +1,6 @@ -from cobald import interfaces +from lapis.workernode import WorkerNode -from usim import time, Scope, instant, Capacities, ResourcesUnavailable, Queue +from usim import time, Scope, instant, ResourcesUnavailable from typing import Optional, TYPE_CHECKING from lapis.cachingjob import CachingJob @@ -11,11 +11,7 @@ from lapis.caching.connection import Connection -class ResourcesExceeded(Exception): - ... - - -class Drone(interfaces.Pool): +class Drone(WorkerNode): """ Represents worker nodes in the simulation. 
""" @@ -47,41 +43,15 @@ def __init__( :param empty: callable that determines whether the drone is currently running any jobs """ - super(Drone, self).__init__() - self.scheduler = scheduler - """scheduler that assigns jobs to the drone""" + super().__init__( + scheduler, pool_resources, scheduling_duration, ignore_resources + ) self.connection = connection """connection object that holds remote connection and handles file transfers""" self.sitename = sitename """identifies the site the drone belongs to, used to determine which caches a drone can use """ - self.pool_resources = pool_resources - """dict stating the drone's resources""" - self.resources = Capacities(**pool_resources) - """available resources, based on the amount of resources requested by - jobs running on the drone """ - # shadowing requested resources to determine jobs to be killed - self.used_resources = Capacities(**pool_resources) - """available resources, based on the amount of resources actually used by - jobs running on the drone""" - if ignore_resources: - self._valid_resource_keys = [ - resource - for resource in self.pool_resources - if resource not in ignore_resources - ] - else: - self._valid_resource_keys = self.pool_resources.keys() - self.scheduling_duration = scheduling_duration - """amount of time that passes between the drone's - start up and it's registration at the scheduler""" - self._supply = 0 - self.jobs = 0 - """number of jobs running on the drone""" - self._allocation = None - self._utilisation = None - self._job_queue = Queue() self._empty = empty """method that is used to determine whether a drone is empty""" @@ -102,26 +72,6 @@ def empty(self): """ return self._empty(self) - @property - def theoretical_available_resources(self): - """ - Returns the amount of resources of the drone that were available if all jobs - used exactly the amount of resources they requested - - :return: dictionary of theoretically available resources - """ - return dict(self.resources.levels) - - @property - def available_resources(self): - """ - Returns the amount of resources of the drone that are available based on the - amount of resources the running jobs actually use. - - :return: dictionary of available resources - """ - return dict(self.used_resources.levels) - async def run(self): """ Handles the drone's activity during simulation. Upon execution the drone @@ -147,30 +97,6 @@ async def run(self): async for job, kill in self._job_queue: scope.do(self._run_job(job=job, kill=kill)) - @property - def supply(self) -> float: - return self._supply - - @property - def demand(self) -> float: - return 1 - - @demand.setter - def demand(self, value: float): - pass # demand is always defined as 1 - - @property - def utilisation(self) -> float: - if self._utilisation is None: - self._init_allocation_and_utilisation() - return self._utilisation - - @property - def allocation(self) -> float: - if self._allocation is None: - self._init_allocation_and_utilisation() - return self._allocation - def _init_allocation_and_utilisation(self): levels = self.resources.levels resources = [] @@ -207,17 +133,7 @@ async def shutdown(self): await (time + 1) - async def schedule_job(self, job: CachingJob, kill: bool = False): - """ - A job is scheduled to a drone by putting it in the drone's job queue. 
- - :param job: job that was matched to the drone - :param kill: flag, if true jobs can be killed if they use more resources - than they requested - """ - await self._job_queue.put((job, kill)) - - async def _run_job(self, job: CachingJob, kill: bool): + async def _run_job(self, job: CachingJob, kill: bool): # FIXME: needs adaptation """ Method manages to start a job in the context of the given drone. The job is started regardless of the available resources. The resource From b327f885018d0749d0c04695f9d2de03ae8a0b57 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 29 Apr 2021 21:04:29 +0200 Subject: [PATCH 628/648] introduced TransferStatistic to differentiate data from cache and remote --- lapis/caching/connection.py | 78 ++++++++++++----------- lapis/caching/storageelement.py | 23 +++++-- lapis/cachingjob.py | 14 ++-- lapis/interfaces/_storage.py | 9 ++- lapis_tests/test_caching_hitrate_based.py | 36 +++++++---- 5 files changed, 98 insertions(+), 62 deletions(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 9d0390f..9636326 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -1,6 +1,6 @@ import random -from typing import Union, Optional +from typing import Union, Optional, Tuple from usim import Scope, time from lapis.caching.monitoredpipe import MonitoredPipe @@ -14,6 +14,8 @@ from lapis.caching.storageelement import StorageElement, RemoteStorage from lapis.caching.files import RequestedFile, RequestedFile_HitrateBased from lapis.monitor.core import sampling_required + +from lapis.interfaces._storage import TransferStatistics from lapis.monitor.caching import HitrateInfo @@ -121,7 +123,9 @@ def _determine_inputfile_source( return entry.storage return self.remote_connection - async def stream_file(self, requested_file: RequestedFile, dronesite): + async def stream_file( + self, requested_file: RequestedFile, dronesite + ) -> TransferStatistics: """ Determines which storage object is used to provide the requested file and starts the files transfer. For files transferred via remote connection a @@ -153,59 +157,59 @@ async def stream_file(self, requested_file: RequestedFile, dronesite): ) except KeyError: pass - await used_connection.transfer(requested_file) + transfer_statistics = await used_connection.transfer(requested_file) + return transfer_statistics - async def transfer_files(self, drone, requested_files: dict, job_repr): + async def transfer_files( + self, drone, requested_files: dict + ) -> Tuple[int, int, int, int]: """ Converts dict information about requested files to RequestedFile object and sequentially streams all files. 
:param drone: :param requested_files: - :param job_repr: - :return: time that passed while file was transferred + :return: time that passed while file was transferred, bytes that were + transferred from remote, bytes that were transferred from cache, and + information if files were provided by cache """ - start_time = time.now + requested_bytes = sum([file["usedsize"] for file in requested_files.values()]) + # decision if a jobs inputfiles are cached based on hitrate random_inputfile_information = next(iter(requested_files.values())) + # TODO: does not work in case non-hitrate-based filespecs are given if "hitrates" in random_inputfile_information.keys(): - try: - hitrate = sum( - [ - file["usedsize"] * file["hitrates"].get(drone.sitename, 0.0) - for file in requested_files.values() - ] - ) / sum([file["usedsize"] for file in requested_files.values()]) - provides_file = int(random.random() < hitrate) - - except ZeroDivisionError: - hitrate = 0 - provides_file = 0 - # TODO:: In which cases is hitrate not defined and how can they be covered? I - # think that in this case this code should not be reached but I'm unsure - # right now + cached_bytes = sum( + [ + file["usedsize"] * file["hitrates"].get(drone.sitename, 0.0) + for file in requested_files.values() + ] + ) + # TODO: should be 1 in case of requested_bytes == 0 + hitrate = cached_bytes / requested_bytes if requested_bytes > 0 else 0 + provides_file = int(random.random() < hitrate) + # TODO: In which cases is hitrate not defined and how can they be covered? I + # think that in this case this code should not be reached but I'm unsure + # right now await sampling_required.put( - HitrateInfo( - hitrate, - sum([file["usedsize"] for file in requested_files.values()]), - provides_file, - ) + HitrateInfo(hitrate, requested_bytes, provides_file) ) - job_repr._read_from_cache = provides_file - for inputfilename, inputfilespecs in requested_files.items(): - if "hitrates" in inputfilespecs.keys(): + bytes_from_cache = 0 + bytes_from_remote = 0 + for filename, filespecs in requested_files.items(): + filesize = filespecs["usedsize"] + if "hitrates" in filespecs.keys(): requested_file = RequestedFile_HitrateBased( - inputfilename, inputfilespecs["usedsize"], provides_file + filename, filesize, provides_file ) - else: - requested_file = RequestedFile( - inputfilename, inputfilespecs["usedsize"] - ) - await self.stream_file(requested_file, drone.sitename) + requested_file = RequestedFile(filename, filesize) + transfer_statistics = await self.stream_file(requested_file, drone.sitename) + bytes_from_cache += transfer_statistics.bytes_from_cache + bytes_from_remote += transfer_statistics.bytes_from_remote stream_time = time.now - start_time - return stream_time + return stream_time, bytes_from_remote, bytes_from_cache, provides_file diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index cb06654..72fac21 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -5,7 +5,7 @@ from lapis.monitor.core import sampling_required from lapis.caching.files import StoredFile, RequestedFile, RequestedFile_HitrateBased -from lapis.interfaces._storage import Storage, LookUpInformation +from lapis.interfaces._storage import Storage, LookUpInformation, TransferStatistics import logging @@ -44,7 +44,7 @@ def available(self): def used(self): return 0 - async def transfer(self, file: RequestedFile, **kwargs): + async def transfer(self, file: RequestedFile, **kwargs) -> TransferStatistics: """ Simulates the transfer of 
a requested file via the remote storage's pipe. @@ -52,6 +52,7 @@ async def transfer(self, file: RequestedFile, **kwargs): """ await self.connection.transfer(total=file.filesize) await sampling_required.put(self.connection) + return TransferStatistics(bytes_from_remote=file.filesize, bytes_from_cache=0) async def add(self, file: StoredFile, **kwargs): """ @@ -204,12 +205,14 @@ async def transfer(self, file: RequestedFile): :param file: :param job_repr: Needed for debug output, will be replaced """ + assert file.filename in self.files, f"File {file.filename} is not on storage" await self.connection.transfer(file.filesize) try: # TODO: needs handling of KeyError await self._update(self.files[file.filename]) except AttributeError: pass + return TransferStatistics(bytes_from_remote=0, bytes_from_cache=file.filesize) def find(self, file: RequestedFile): """ @@ -278,21 +281,26 @@ async def transfer(self, file: RequestedFile): :param file: """ + hitrate_size = self._hitrate * file.filesize async with Scope() as scope: logging.getLogger("implementation").warning( "{} {} @ {} in {}".format( - self._hitrate * file.filesize, - (1 - self._hitrate) * file.filesize, + hitrate_size, + file.filesize - hitrate_size, time.now, file.filename[-30:], ) ) - scope.do(self.connection.transfer(total=self._hitrate * file.filesize)) + scope.do(self.connection.transfer(total=hitrate_size)) scope.do( self.remote_storage.connection.transfer( - total=(1 - self._hitrate) * file.filesize + total=file.filesize - hitrate_size ) ) + return TransferStatistics( + bytes_from_remote=file.filesize - hitrate_size, + bytes_from_cache=hitrate_size, + ) def find(self, file: RequestedFile): return LookUpInformation(file.filesize, self) @@ -321,7 +329,7 @@ class FileBasedHitrateStorage(StorageElement): The definition of the storage objects size is currently irrelevant. # TODO: this storage object has become very intermingled with the connection - module and should be tidied up and restructured! + module and should be tidied up and restructured! """ def __init__( @@ -356,6 +364,7 @@ async def transfer(self, file: RequestedFile_HitrateBased): print("wants to read from remote") print("file is not cached but cache is file source, this should not occur") raise ValueError + return TransferStatistics(bytes_from_remote=0, bytes_from_cache=file.filesize) def find(self, file: RequestedFile_HitrateBased): """ diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index beb4927..d9ae0c6 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -194,15 +194,21 @@ def _calculation_time(self): return result async def _transfer_inputfiles(self): - start = time.now + transfer_time = 0 try: - await self.drone.connection.transfer_files( - drone=self.drone, requested_files=self.used_inputfiles, job_repr=self + ( + transfer_time, + bytes_from_remote, # FIXME: include somewhere? + bytes_from_cache, # FIXME: include somewhere? 
+ provides_file, + ) = await self.drone.connection.transfer_files( + drone=self.drone, requested_files=self.used_inputfiles ) + self._read_from_cache = provides_file except AttributeError: pass print("end transfer files ", time.now) - self._transfer_time = time.now - start + self._transfer_time = transfer_time async def run(self, drone: "Drone"): """ diff --git a/lapis/interfaces/_storage.py b/lapis/interfaces/_storage.py index aeab496..7dbd292 100644 --- a/lapis/interfaces/_storage.py +++ b/lapis/interfaces/_storage.py @@ -5,6 +5,13 @@ from lapis.caching.files import RequestedFile, StoredFile +class TransferStatistics(NamedTuple): + bytes_from_remote: int + """bytes transferred from remote""" + bytes_from_cache: int + """bytes transferred from cache""" + + class LookUpInformation(NamedTuple): cached_filesize: int storage: "Storage" @@ -35,7 +42,7 @@ def used(self) -> int: raise NotImplementedError @abc.abstractmethod - async def transfer(self, file: RequestedFile): + async def transfer(self, file: RequestedFile) -> TransferStatistics: """ Transfer size of given file via the storages' connection and update file information. If the file was deleted since it was originally looked up diff --git a/lapis_tests/test_caching_hitrate_based.py b/lapis_tests/test_caching_hitrate_based.py index 07473d1..b3d28ed 100644 --- a/lapis_tests/test_caching_hitrate_based.py +++ b/lapis_tests/test_caching_hitrate_based.py @@ -2,7 +2,7 @@ from tempfile import NamedTemporaryFile import json -from lapis_tests import via_usim, DummyDrone, DummyJob +from lapis_tests import via_usim, DummyDrone from lapis.caching.connection import Connection from lapis.caching.storageelement import FileBasedHitrateStorage, HitrateStorage from lapis.storage_io.storage import storage_reader @@ -66,30 +66,35 @@ async def test_stream_file(self): @via_usim async def test_single_transfer_files(self): + hitrate = 0.5 throughput = 10 size = 1000 drone = DummyDrone(throughput) - job = DummyJob(True) requested_files = dict( test=dict(usedsize=100 * conversion_GB_to_B, hitrates={drone.sitename: 1.0}) ) - hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) - # does not transfer from cache but from remote storage as there are no files - # in the HitrateStorage + hitratestorage = HitrateStorage(hitrate=hitrate, size=size, files={}) + # half of the data is transferred from remote the other from cache drone.connection.add_storage_element(hitratestorage) - stream_time = await drone.connection.transfer_files( - drone=drone, requested_files=requested_files, job_repr=job + ( + stream_time, + _bytes_from_remote, + bytes_from_cache, + _provides_file, + ) = await drone.connection.transfer_files( + drone=drone, requested_files=requested_files ) assert time.now == 5 assert stream_time == 5 + assert bytes_from_cache == 100 * conversion_GB_to_B * hitrate @via_usim async def test_simultaneous_transfer(self): + hitrate = 0.5 throughput = 10 size = 1000 drone = DummyDrone(throughput) - job = DummyJob(True) requested_files = dict( test1=dict( usedsize=100 * conversion_GB_to_B, hitrates={drone.sitename: 1.0} @@ -98,15 +103,20 @@ async def test_simultaneous_transfer(self): usedsize=200 * conversion_GB_to_B, hitrates={drone.sitename: 1.0} ), ) - hitratestorage = HitrateStorage(hitrate=0.5, size=size, files={}) + hitratestorage = HitrateStorage(hitrate=hitrate, size=size, files={}) drone.connection.add_storage_element(hitratestorage) - # does not transfer from cache but from remote storage as there are no files - # in the HitrateStorage - stream_time = 
await drone.connection.transfer_files( - drone=drone, requested_files=requested_files, job_repr=job + # half of the data is transferred from remote the other from cache + ( + stream_time, + _bytes_from_remote, + bytes_from_cache, + _provides_file, + ) = await drone.connection.transfer_files( + drone=drone, requested_files=requested_files ) assert time.now == 15 assert stream_time == 15 + assert bytes_from_cache == (100 + 200) * conversion_GB_to_B * hitrate def test_full_simulation_with_hitratebased_caching(self): with NamedTemporaryFile(suffix=".csv") as machine_config, NamedTemporaryFile( From 55a393dbe66308436cd1756a8b5bf249616e242f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 16:02:16 +0200 Subject: [PATCH 629/648] better specified typehints --- lapis/caching/connection.py | 6 +++--- lapis/caching/files.py | 2 +- lapis/caching/monitoredpipe.py | 4 ++-- lapis/caching/storageelement.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index 9636326..bb84f7b 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -42,7 +42,7 @@ class Connection(object): "_filebased_caching", ) - def __init__(self, throughput, filebased_caching=True): + def __init__(self, throughput: float, filebased_caching: bool = True): """ Initialization of the connection object :param throughput: throughput of the connection's remote storage @@ -124,7 +124,7 @@ def _determine_inputfile_source( return self.remote_connection async def stream_file( - self, requested_file: RequestedFile, dronesite + self, requested_file: RequestedFile, dronesite: Optional[str] ) -> TransferStatistics: """ Determines which storage object is used to provide the requested file and @@ -162,7 +162,7 @@ async def stream_file( async def transfer_files( self, drone, requested_files: dict - ) -> Tuple[int, int, int, int]: + ) -> Tuple[float, int, int, int]: """ Converts dict information about requested files to RequestedFile object and sequentially streams all files. 
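For orientation: the TransferStatistics tuple introduced in [PATCH 628] travels from the individual storage elements through Connection.stream_file into Connection.transfer_files, which sums the per-file values into the (stream_time, bytes_from_remote, bytes_from_cache, provides_file) result consumed by CachingJob._transfer_inputfiles. The following sketch only illustrates that aggregation step and is not part of any patch; the helper name aggregate_transfers is invented for the example.

.. code:: python3

    from typing import Iterable, NamedTuple, Tuple

    class TransferStatistics(NamedTuple):
        bytes_from_remote: int  # bytes read via the remote connection
        bytes_from_cache: int   # bytes read from a cache

    def aggregate_transfers(per_file: Iterable[TransferStatistics]) -> Tuple[int, int]:
        # Sum per-file statistics the same way transfer_files does before
        # returning them alongside stream_time and provides_file.
        bytes_from_remote = sum(stats.bytes_from_remote for stats in per_file)
        bytes_from_cache = sum(stats.bytes_from_cache for stats in per_file)
        return bytes_from_remote, bytes_from_cache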
diff --git a/lapis/caching/files.py b/lapis/caching/files.py index 5792dd0..b5ae1a6 100644 --- a/lapis/caching/files.py +++ b/lapis/caching/files.py @@ -23,7 +23,7 @@ def __init__( storedsize: Optional[int] = None, cachedsince: Optional[int] = None, lastaccessed: Optional[int] = None, - numberofaccesses: Optional[int] = None, + numberofaccesses: int = 0, **filespecs, ): """ diff --git a/lapis/caching/monitoredpipe.py b/lapis/caching/monitoredpipe.py index 91f5585..a6e58d4 100644 --- a/lapis/caching/monitoredpipe.py +++ b/lapis/caching/monitoredpipe.py @@ -20,10 +20,10 @@ def __init__(self, throughput: float): super().__init__(throughput) self._monitor = Notification() self._monitor_buffers: Dict[Any, Deque[MonitoredPipeInfo]] = {} - self.storage = None + self.storage: Optional[str] = None """storage object the pipe simulates the network connection for, for monitoring purposes""" - self.transferred_data = 0 + self.transferred_data: float = 0 """total amount of data transferred by the pipe, for monitoring purposes""" async def load(self) -> AsyncIterable[MonitoredPipeInfo]: diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index 72fac21..fa84500 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -144,7 +144,7 @@ def __init__( connection**""" self.connection.storage = repr(self) - self.remote_storage = None + self.remote_storage: Optional[RemoteStorage] = None """remote storage that provides files that are not stored in the cache""" @property From 052984d3c4dcd6eb6f6196507a67324e687ff6c8 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 16:49:11 +0200 Subject: [PATCH 630/648] hitratebased requested file now subclasses requested file --- lapis/caching/files.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/lapis/caching/files.py b/lapis/caching/files.py index b5ae1a6..20213cf 100644 --- a/lapis/caching/files.py +++ b/lapis/caching/files.py @@ -1,4 +1,4 @@ -from typing import Optional, NamedTuple +from typing import Optional class StoredFile(object): @@ -55,15 +55,18 @@ def access(self, access_time: int): self.numberofaccesses += 1 -class RequestedFile(NamedTuple): +class RequestedFile(object): """ Representation of a requested file """ - filename: str - """name of the file""" - filesize: Optional[int] = None - """size of the file""" + __slots__ = ("filename", "filesize") + + def __init__(self, filename: str, filesize: Optional[int]) -> None: + """name of the file""" + self.filename = filename + """size of the file""" + self.filesize = filesize def to_stored_file(self, currenttime: int) -> StoredFile: """ @@ -80,16 +83,16 @@ def to_stored_file(self, currenttime: int) -> StoredFile: ) -class RequestedFile_HitrateBased(NamedTuple): +class RequestedFile_HitrateBased(RequestedFile): """ Represents a requested file in hitrate based caching. The cachehitrate flag is somewhat messed up currently. 
**Its use should be reworked when remodeling the connection module.** """ - filename: str - """name of the requested file""" - filesize: int - """size of the requested file""" - cachehitrate: int - """flag whether the file is cached, 1 if it is cached, 0 if it is not cached""" + __slots__ = "cachehitrate" + + def __init__(self, filename: str, filesize: Optional[int], cachehitrate: int): + super().__init__(filename, filesize) + """flag whether the file is cached, 1 if it is cached, 0 if it is not cached""" + self.cachehitrate = cachehitrate From b0e8dc4f4d74bcf07c27e41739385c76bb77943e Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 16:50:47 +0200 Subject: [PATCH 631/648] hitrate 0 if not defined --- lapis/caching/connection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py index bb84f7b..3f48aca 100644 --- a/lapis/caching/connection.py +++ b/lapis/caching/connection.py @@ -189,7 +189,9 @@ async def transfer_files( ) # TODO: should be 1 in case of requested_bytes == 0 hitrate = cached_bytes / requested_bytes if requested_bytes > 0 else 0 - provides_file = int(random.random() < hitrate) + else: + hitrate = 0 + provides_file = int(random.random() < hitrate) # TODO: In which cases is hitrate not defined and how can they be covered? I # think that in this case this code should not be reached but I'm unsure # right now From 7638ab55e28a120c646c717e469c5664fa5a1ae3 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 17:05:00 +0200 Subject: [PATCH 632/648] changed order in which a file is stored on cache --- lapis/caching/storageelement.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index fa84500..68c7b82 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -181,10 +181,9 @@ async def add(self, file: RequestedFile): :param file: representation of the file that is added to the storage """ - file = file.to_stored_file(time.now) await self._usedstorage.increase(size=file.filesize) - self.files[file.filename] = file await self.connection.transfer(file.filesize) + self.files[file.filename] = file.to_stored_file(time.now) async def _update(self, stored_file: StoredFile): """ From 67c8d6d164e4fb5d4d78feaa7e60138d42c5eae9 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 17:21:23 +0200 Subject: [PATCH 633/648] adapted typehints for filesizes and time information --- lapis/caching/files.py | 16 ++++++++-------- lapis/caching/storageelement.py | 6 ++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lapis/caching/files.py b/lapis/caching/files.py index 20213cf..893c153 100644 --- a/lapis/caching/files.py +++ b/lapis/caching/files.py @@ -19,10 +19,10 @@ class StoredFile(object): def __init__( self, filename: str, - filesize: Optional[int] = None, - storedsize: Optional[int] = None, - cachedsince: Optional[int] = None, - lastaccessed: Optional[int] = None, + filesize: int = 0, + storedsize: int = 0, + cachedsince: Optional[float] = None, + lastaccessed: Optional[float] = None, numberofaccesses: int = 0, **filespecs, ): @@ -44,7 +44,7 @@ def __init__( self.lastaccessed = lastaccessed self.numberofaccesses = numberofaccesses - def access(self, access_time: int): + def access(self, access_time: float): """ Tracks a new access to the file at time `access_time`, including incrementing the access count. 
@@ -62,13 +62,13 @@ class RequestedFile(object): __slots__ = ("filename", "filesize") - def __init__(self, filename: str, filesize: Optional[int]) -> None: + def __init__(self, filename: str, filesize: int) -> None: """name of the file""" self.filename = filename """size of the file""" self.filesize = filesize - def to_stored_file(self, currenttime: int) -> StoredFile: + def to_stored_file(self, currenttime: float) -> StoredFile: """ Converts a requested file into a stored file @@ -92,7 +92,7 @@ class RequestedFile_HitrateBased(RequestedFile): __slots__ = "cachehitrate" - def __init__(self, filename: str, filesize: Optional[int], cachehitrate: int): + def __init__(self, filename: str, filesize: int, cachehitrate: int): super().__init__(filename, filesize) """flag whether the file is cached, 1 if it is cached, 0 if it is not cached""" self.cachehitrate = cachehitrate diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index 68c7b82..74b0f6d 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -102,7 +102,7 @@ def __init__( sitename: Optional[str] = None, size: int = 1000 * 1000 * 1000 * 1000, throughput_limit: int = 10 * 1000 * 1000 * 1000, - files: Optional[dict] = None, + files: Optional[dict[str, StoredFile]] = None, deletion_duration: float = 5, update_duration: float = 1, ): @@ -204,7 +204,9 @@ async def transfer(self, file: RequestedFile): :param file: :param job_repr: Needed for debug output, will be replaced """ - assert file.filename in self.files, f"File {file.filename} is not on storage" + assert ( + file.filename in self.files if self.files else False + ), f"File {file.filename} is not on storage" await self.connection.transfer(file.filesize) try: # TODO: needs handling of KeyError From c7e10254544514a56c835dd0fd9f9cbab3f3950f Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 18:33:50 +0200 Subject: [PATCH 634/648] fixed implementation of delete_oldest --- lapis/caching/cachealgorithm.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lapis/caching/cachealgorithm.py b/lapis/caching/cachealgorithm.py index 20febe4..79df22a 100644 --- a/lapis/caching/cachealgorithm.py +++ b/lapis/caching/cachealgorithm.py @@ -15,15 +15,18 @@ def check_relevance(file: RequestedFile, storage: StorageElement): def delete_oldest( file: RequestedFile, storage: StorageElement -) -> Tuple[bool, Tuple[StoredFile]]: - deletable_files = [] +) -> Tuple[bool, Optional[Tuple[StoredFile, ...]]]: currently_free = storage.available - if currently_free < storage.available: - sorted_files = sort_files_by_cachedsince(storage.files.items()) - while currently_free < file.filesize: - deletable_files.append(next(sorted_files)) - currently_free += deletable_files[-1].filesize - return True, tuple(deletable_files) + if currently_free >= file.filesize: + return True, None + deletable_files = [] + sorted_files = sort_files_by_cachedsince(storage.files.values()) + for current_file in sorted_files: + deletable_files.append(current_file) + currently_free += current_file.filesize + if currently_free >= file.filesize: + return True, tuple(deletable_files) + return False, None def delete_oldest_few_used( From 77e3f567f574672fd1c58b5db4c3dd8b92d10fde Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 18:37:21 +0200 Subject: [PATCH 635/648] adapted implementation of delete_oldest_few_used --- lapis/caching/cachealgorithm.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) 
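Taken together, the delete_oldest fix above and the delete_oldest_few_used adaptation below give both deletion strategies the same contract: they return a success flag plus an optional tuple of files to evict. The sketch below shows how a caller could act on that contract; it is an illustration only, cache_if_possible is an invented name, and storage.add/storage.remove are the asynchronous storage-element methods shown in the surrounding patches.

.. code:: python3

    async def cache_if_possible(algorithm, requested_file, storage) -> bool:
        # CacheAlgorithm.consider returns (admit, files_to_delete)
        admit, deletable = algorithm.consider(requested_file, storage)
        if not admit:
            return False
        for stored_file in deletable or ():
            await storage.remove(stored_file)  # free enough space first
        await storage.add(requested_file)      # then cache the new file
        return True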
diff --git a/lapis/caching/cachealgorithm.py b/lapis/caching/cachealgorithm.py index 79df22a..40bbecf 100644 --- a/lapis/caching/cachealgorithm.py +++ b/lapis/caching/cachealgorithm.py @@ -31,17 +31,18 @@ def delete_oldest( def delete_oldest_few_used( file: RequestedFile, storage: StorageElement -) -> Tuple[bool, Optional[Tuple[StoredFile]]]: - deletable_files = [] +) -> Tuple[bool, Optional[Tuple[StoredFile, ...]]]: currently_free = storage.available - if currently_free < storage.available: - sorted_files = sort_files_by_cachedsince(storage.files.items()) - for current_file in sorted_files: - if current_file.numberofaccesses < 3: - deletable_files.append(current_file) - currently_free += deletable_files[-1].filesize - if currently_free >= file.filesize: - return True, tuple(deletable_files) + if currently_free >= file.filesize: + return True, None + deletable_files = [] + sorted_files = sort_files_by_cachedsince(storage.files.values()) + for current_file in sorted_files: + if current_file.numberofaccesses < 3: + deletable_files.append(current_file) + currently_free += current_file.filesize + if currently_free >= file.filesize: + return True, tuple(deletable_files) return False, None @@ -54,7 +55,7 @@ def __init__(self, caching_strategy: Callable, deletion_strategy: Callable): def consider( self, file: RequestedFile, storage: StorageElement - ) -> Tuple[bool, Optional[Tuple[StoredFile]]]: + ) -> Tuple[bool, Optional[Tuple[StoredFile, ...]]]: if self._caching_strategy(file, storage): return self._deletion_strategy(file, storage) return False, None From 211a8d907fa36ca36d507384f153b18601dc1723 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 19:33:30 +0200 Subject: [PATCH 636/648] files are now required for storage elements --- lapis/caching/storageelement.py | 10 +++++----- lapis_tests/test_storage_filebasedhitrate.py | 17 ++++++++++------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index 74b0f6d..ef53184 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -98,11 +98,11 @@ class StorageElement(Storage): def __init__( self, + files: dict[str, StoredFile], name: Optional[str] = None, sitename: Optional[str] = None, size: int = 1000 * 1000 * 1000 * 1000, throughput_limit: int = 10 * 1000 * 1000 * 1000, - files: Optional[dict[str, StoredFile]] = None, deletion_duration: float = 5, update_duration: float = 1, ): @@ -248,19 +248,19 @@ class HitrateStorage(StorageElement): def __init__( self, + files: dict[str, StoredFile], hitrate, name: Optional[str] = None, sitename: Optional[str] = None, size: int = 1000 * 1000 * 1000 * 1000, throughput_limit: int = 10 * 1000 * 1000 * 1000, - files: Optional[dict] = None, ): super(HitrateStorage, self).__init__( + files=files, name=name, sitename=sitename, size=size, throughput_limit=throughput_limit, - files=files, ) self._hitrate = hitrate """global cache hitrate of this cache""" @@ -335,18 +335,18 @@ class FileBasedHitrateStorage(StorageElement): def __init__( self, + files: dict[str, StoredFile], name: Optional[str] = None, sitename: Optional[str] = None, size: int = 1000 * 1000 * 1000 * 1000, throughput_limit: int = 10 * 1000 * 1000 * 1000, - files: Optional[dict] = None, ): super(FileBasedHitrateStorage, self).__init__( + files={}, name=name, sitename=sitename, size=size, throughput_limit=throughput_limit, - files={}, ) @property diff --git a/lapis_tests/test_storage_filebasedhitrate.py 
b/lapis_tests/test_storage_filebasedhitrate.py index 153dabc..bd39ad5 100644 --- a/lapis_tests/test_storage_filebasedhitrate.py +++ b/lapis_tests/test_storage_filebasedhitrate.py @@ -11,7 +11,7 @@ class TestFileBasedHitrateStorag: def test_storage_initialization(self): filebasedhitratestorage = FileBasedHitrateStorage( - name="name", sitename="site", size=200, throughput_limit=1 + files={}, name="name", sitename="site", size=200, throughput_limit=1 ) assert filebasedhitratestorage.files == {} assert filebasedhitratestorage.name == "name" @@ -25,7 +25,7 @@ def test_storage_initialization(self): @via_usim async def test_transfer(self): filebasedhitratestorage = FileBasedHitrateStorage( - name="name", sitename="site", size=200, throughput_limit=1 + files={}, name="name", sitename="site", size=200, throughput_limit=1 ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) await filebasedhitratestorage.transfer(requestedFile) @@ -37,21 +37,24 @@ async def test_transfer(self): def test_find_file_in_storage(self): filebasedhitratestorage = FileBasedHitrateStorage( - name="name", sitename="site", size=200, throughput_limit=1 + files={}, name="name", sitename="site", size=200, throughput_limit=1 ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) foundFile = LookUpInformation(20, filebasedhitratestorage) assert filebasedhitratestorage.find(requestedFile) == foundFile - def test_modification_of_stored_files(self): + @via_usim + async def test_modification_of_stored_files(self): filebasedhitratestorage = FileBasedHitrateStorage( - name="name", sitename="site", size=200, throughput_limit=1 + files={}, name="name", sitename="site", size=200, throughput_limit=1 ) requestedFile = RequestedFile_HitrateBased("filename", 20, 1) - filebasedhitratestorage.add(requestedFile) + await filebasedhitratestorage.add(requestedFile) assert filebasedhitratestorage.files == {} - filebasedhitratestorage.remove(requestedFile) + stored_file = requestedFile.to_stored_file(time.now) + + await filebasedhitratestorage.remove(stored_file) assert filebasedhitratestorage.files == {} From 14b28122b1fbce5a351ba6987fa2ba96ba0a2d8b Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 19:34:12 +0200 Subject: [PATCH 637/648] fixed typehints --- lapis/utilities/cache_cleanup_implementations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapis/utilities/cache_cleanup_implementations.py b/lapis/utilities/cache_cleanup_implementations.py index 9892f6a..f453989 100644 --- a/lapis/utilities/cache_cleanup_implementations.py +++ b/lapis/utilities/cache_cleanup_implementations.py @@ -1,10 +1,10 @@ -from typing import List +from typing import List, Iterable from lapis.caching.files import StoredFile -def sort_files_by_cachedsince(stored_files: set) -> List[StoredFile]: - return sorted(list(stored_files), key=lambda x: x.cachedsince) +def sort_files_by_cachedsince(stored_files: Iterable[StoredFile]) -> List[StoredFile]: + return sorted(stored_files, key=lambda x: x.cachedsince) # async def fifo(size, storage): From dc996d344b05a780790719c7a27dda73c48654ba Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 19:35:34 +0200 Subject: [PATCH 638/648] calculation_efficiency not optional anymore --- lapis/cachingjob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index d9ae0c6..3429c20 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -84,7 +84,7 @@ def __init__( in_queue_since: float = 0, queue_date: 
float = 0, name: Optional[str] = None, - calculation_efficiency: Optional[float] = 1.0, + calculation_efficiency: float = 1.0, ): """ Initialization of a job From 82a3afc1f1c42569a37c1412d51d050bc6d0e045 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 19:35:55 +0200 Subject: [PATCH 639/648] improved calculation of probabilities --- lapis/cachingjob.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/lapis/cachingjob.py b/lapis/cachingjob.py index 3429c20..34e870b 100644 --- a/lapis/cachingjob.py +++ b/lapis/cachingjob.py @@ -139,25 +139,19 @@ def __init__( # TODO: see unit test test_read_with_inputfiles -> making # information about hitrates obilgatory is actually necessary - if self._total_input_data: + if self._total_input_data > 0: self.expectation_cached_data = sum( [ file["usedsize"] * sum(file["hitrates"].values()) for file in self.used_inputfiles.values() ] ) + self.cache_probability = ( + self.expectation_cached_data / self._total_input_data + ) else: self.expectation_cached_data = 0 """amount of data that was read from the cache""" - - if self._total_input_data: - self.cache_probability = sum( - [ - file["usedsize"] * sum(file["hitrates"].values()) - for file in self.used_inputfiles.values() - ] - ) / sum([file["usedsize"] for file in self.used_inputfiles.values()]) - else: self.cache_probability = 0 self.failed_matches = 0 From 32634ace6399b6ef583db8c96ba2af86e02d7079 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 6 May 2021 19:36:51 +0200 Subject: [PATCH 640/648] drone is optional when scheduling --- lapis/scheduler.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lapis/scheduler.py b/lapis/scheduler.py index da92504..f0c0007 100644 --- a/lapis/scheduler.py +++ b/lapis/scheduler.py @@ -1,7 +1,18 @@ import random from abc import ABC, abstractmethod from statistics import mean -from typing import Dict, Iterator, Tuple, List, TypeVar, Generic, Set, NamedTuple, Any +from typing import ( + Dict, + Iterator, + Optional, + Tuple, + List, + TypeVar, + Generic, + Set, + NamedTuple, + Any, +) from weakref import WeakKeyDictionary from sortedcontainers import SortedDict @@ -346,7 +357,7 @@ async def job_finished(self, job): else: await self._stream_queue.put(job) - def _schedule_job(self, job) -> Drone: + def _schedule_job(self, job) -> Optional[Drone]: priorities = {} for cluster in self.drone_cluster: drone = cluster[0] From a3f7e0fa508b4ac7e1d0a43daefa915b3df4ac84 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 May 2021 13:51:00 +0200 Subject: [PATCH 641/648] fixed type --- lapis/caching/storageelement.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py index ef53184..4b97816 100644 --- a/lapis/caching/storageelement.py +++ b/lapis/caching/storageelement.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Dict, Optional from usim import time, Resources, Scope from lapis.caching.monitoredpipe import MonitoredPipe @@ -98,7 +98,7 @@ class StorageElement(Storage): def __init__( self, - files: dict[str, StoredFile], + files: Dict[str, StoredFile], name: Optional[str] = None, sitename: Optional[str] = None, size: int = 1000 * 1000 * 1000 * 1000, @@ -248,7 +248,7 @@ class HitrateStorage(StorageElement): def __init__( self, - files: dict[str, StoredFile], + files: Dict[str, StoredFile], hitrate, name: Optional[str] = None, sitename: Optional[str] = None, @@ -335,7 +335,7 @@ class 
FileBasedHitrateStorage(StorageElement): def __init__( self, - files: dict[str, StoredFile], + files: Dict[str, StoredFile], name: Optional[str] = None, sitename: Optional[str] = None, size: int = 1000 * 1000 * 1000 * 1000, From 36233bc121408625d878513241c9bf4ccec42baa Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 May 2021 16:05:06 +0200 Subject: [PATCH 642/648] improved creation of pools --- custom_simulate.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/custom_simulate.py b/custom_simulate.py index 87bc043..6873286 100644 --- a/custom_simulate.py +++ b/custom_simulate.py @@ -137,26 +137,18 @@ def ini_and_run( for pool_file in pool_files: with open(pool_file, "r") as pool_file: - pool_file_type = "htcondor" - if "dummycluster" in pool_file.name: - # Attention: dummy_pool_connection is currently not part of - # monitoring as it is not known within the simulator itself - # TODO: do you need this in monitoring? - create_pool_in_simulator( - simulator=simulator, - pool_input=pool_file, - pool_reader=pool_import_mapper[pool_file_type], - pool_type=StaticPool, - connection=dummy_pool_connection, - ) - else: - create_pool_in_simulator( - simulator=simulator, - pool_input=pool_file, - pool_reader=pool_import_mapper[pool_file_type], - pool_type=StaticPool, - connection=simulator.connection, - ) + # Attention: dummy_pool_connection is currently not part of + # monitoring as it is not known within the simulator itself + # TODO: do you need this in monitoring? + create_pool_in_simulator( + simulator=simulator, + pool_input=pool_file, + pool_reader=pool_import_mapper["htcondor"], + pool_type=StaticPool, + connection=dummy_pool_connection + if "dummycluster" in pool_file.name + else simulator.connection, + ) simulator.enable_monitoring() From 5eb03a4f799940fdcf1f9171d1dfff0af7852a70 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 May 2021 16:21:45 +0200 Subject: [PATCH 643/648] fixed loop error --- lapis/monitor/timefilter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapis/monitor/timefilter.py b/lapis/monitor/timefilter.py index b095ff1..cab7a80 100644 --- a/lapis/monitor/timefilter.py +++ b/lapis/monitor/timefilter.py @@ -11,5 +11,5 @@ class SimulationTimeFilter(logging.Filter): def filter(self, record) -> bool: # record.created = time.now - record.created = time.now + (1e-9 * __LOOP_STATE__.LOOP.turn) + record.created = time.now + (1e-9 * __LOOP_STATE__.loop.turn) return True From 3927f673ae7214581e616691002650512370d706 Mon Sep 17 00:00:00 2001 From: Eileen Kuehn Date: Thu, 27 May 2021 16:27:01 +0200 Subject: [PATCH 644/648] made flake happy --- lapis_tests/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapis_tests/__init__.py b/lapis_tests/__init__.py index 6f0e72f..ffd0db8 100644 --- a/lapis_tests/__init__.py +++ b/lapis_tests/__init__.py @@ -84,5 +84,7 @@ def __init__(self, throughput: Optional[float] = None): class DummyJob: + __slots__ = "reads_from_cache" + def __init__(self, reads_from_cache=False): self.reads_from_cache = reads_from_cache From de1fe9c92c100bdbe9add93d26ae56214bd770b9 Mon Sep 17 00:00:00 2001 From: Maximilian Horzela <33454678+HerrHorizontal@users.noreply.github.com> Date: Mon, 7 Jun 2021 12:50:34 +0200 Subject: [PATCH 645/648] Extend CLI to support caching extension (#24) * [CLI] Implement CLI-extension for caching * [Project] Configuring CLI simulate script installer * [CLI] Enable multiple storage_files * [CLI] Simplify by getting rid of subcommands 
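As a rough usage illustration of the single simulate command described above (not part of the patch; the input file names are placeholders and lapis.cli.simulate must be importable), the tuple-typed options added below could be exercised via click's test runner:

.. code:: python3

    from click.testing import CliRunner
    from lapis.cli.simulate import cli

    runner = CliRunner()
    result = runner.invoke(
        cli,
        [
            "--job-file", "jobs.json", "htcondor",           # placeholder job input
            "--scheduler-type", "condor_simplified",
            "--static-pool-files", "pools.csv", "htcondor",  # placeholder pool input
            "--until", "1000",
        ],
    )
    print(result.exit_code, result.output)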
Co-authored-by: Eileen Kuehn
Co-authored-by: Max Fischer
---
 custom_simulate.py    |  11 +-
 lapis/cli/simulate.py | 266 +++++++++++++++++++++---------------
 lapis/scheduler.py    |  18 ++-
 lapis/simulator.py    |   2 +-
 pyproject.toml        |   3 +
 5 files changed, 152 insertions(+), 148 deletions(-)

diff --git a/custom_simulate.py b/custom_simulate.py
index 6873286..da71af2 100644
--- a/custom_simulate.py
+++ b/custom_simulate.py
@@ -19,15 +19,15 @@
 )
 
 from lapis.scheduler import CondorClassadJobScheduler
-from lapis.simulator import Simulator
+from lapis.simulator import Simulator
 from lapis.monitor.core import LoggingUDPSocketHandler
 from lapis.monitor.timefilter import SimulationTimeFilter
 
 from time import time
 
-pre_job_rank_defaults = "0"
+pre_job_rank_default = "0"
 
 machine_ad_defaults = """
 requirements = target.requestcpus <= my.cpus
 """.strip()
 
 job_ad_defaults = """
-requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory
-rank = 0"""
+    requirements = my.requestcpus <= target.cpus && my.requestmemory <= target.memory
+    rank = 0
+    """.strip()
 
 last_step = 0
@@ -75,7 +76,7 @@ def ini_and_run(
     until=None,
     calculation_efficiency=1.0,
     log_telegraf=False,
-    pre_job_rank=pre_job_rank_defaults,
+    pre_job_rank=pre_job_rank_default,
     machine_ads=machine_ad_defaults,
     job_ads=job_ad_defaults,
     additional_identifier=None,
diff --git a/lapis/cli/simulate.py b/lapis/cli/simulate.py
index ccbe1ac..6b82423 100644
--- a/lapis/cli/simulate.py
+++ b/lapis/cli/simulate.py
@@ -12,9 +12,12 @@
 from lapis.pool_io.htcondor import htcondor_pool_reader
 from lapis.job_io.swf import swf_job_reader
 from lapis.caching.storageelement import FileBasedHitrateStorage
-from lapis.storage_io.storage import storage_reader
+from lapis.storage_io.storage import (
+    storage_reader,
+    storage_reader_filebased_hitrate_caching,
+)
 
-from lapis.scheduler import CondorJobScheduler
+from lapis.scheduler import CondorJobScheduler, CondorClassadJobScheduler
 from lapis.simulator import Simulator
 
 from lapis.monitor.core import LoggingSocketHandler, LoggingUDPSocketHandler
@@ -24,27 +27,102 @@
 job_import_mapper = {"htcondor": htcondor_job_reader, "swf": swf_job_reader}
 
+scheduler_import_mapper = {
+    "condor_simplified": CondorJobScheduler,
+    "condor_classad": CondorClassadJobScheduler,
+}
+
 pool_import_mapper = {"htcondor": htcondor_pool_reader}
 
-storage_import_mapper = {"standard": storage_reader}
+storage_import_mapper = {
+    "standard": storage_reader,
+    "filehitrate": storage_reader_filebased_hitrate_caching,
+}
 
-"""Simulation CLI, pay attention to the fact that the random seed is currently set to a
-fixed value"""
+"""Simulation CLI, pay attention to the fact that there is currently only one
+throughput parameter for all storages available"""
 
-@click.group()
-@click.option("--seed", type=int, default=1234)
+@click.command()
+@click.option("--seed", type=int, default=1234, help="random seed")
 @click.option("--until", type=float)
 @click.option("--log-tcp", "log_tcp", is_flag=True)
 @click.option("--log-file", "log_file", type=click.File("w"))
 @click.option("--log-telegraf", "log_telegraf", is_flag=True)
-@click.option("--calculation-efficiency", type=float)
-@click.pass_context
-def cli(ctx, seed, until, log_tcp, log_file, log_telegraf, calculation_efficiency):
-    ctx.ensure_object(dict)
-    ctx.obj["seed"] = seed
-    ctx.obj["until"] = until
-    ctx.obj["calculation_efficiency"] = calculation_efficiency
+@click.option("--calculation-efficiency", type=float, default=1.0)
+@click.option(
+    "--job-file",
+    "job_file",
+    type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))),
+)
+@click.option("--pre-job-rank", "pre_job_rank", type=str, default=None)
+@click.option("--machine-ads", "machine_ads", type=str, default=None)
+@click.option("--job-ads", "job_ads", type=str, default=None)
+@click.option(
+    "--scheduler-type",
+    "scheduler_type",
+    type=click.Choice(list(scheduler_import_mapper.keys())),
+)
+@click.option(
+    "--static-pool-files",
+    "static_pool_files",
+    type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))),
+    multiple=True,
+    help="Tuple of `(static_pool_file,static_pool_file_type)`",
+)
+@click.option(
+    "--dynamic-pool-files",
+    "dynamic_pool_files",
+    type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))),
+    multiple=True,
+    help="Tuple of `(dynamic_pool_file,dynamic_pool_file_type)`",
+)
+@click.option(
+    "--storage-files",
+    "storage_files",
+    type=(
+        click.File("r"),
+        click.File("r"),
+        click.Choice(list(storage_import_mapper.keys())),
+    ),
+    default=(None, None, None),
+    multiple=True,
+    help="Tuple of `(storage_file,storage_content_file,storage_type)`",
+)
+@click.option(
+    "--remote-throughput",
+    "remote_throughput",
+    type=float,
+    default=1.0,
+    help="Parameter to set the network bandwidth to remote",
+)
+@click.option(
+    "--filebased-caching",
+    "filebased_caching",
+    is_flag=True,
+    help="Flag to set filebased caching on/off",
+    default=False,
+)
+@click.option("--cache-hitrate", "cache_hitrate", type=float, default=None)
+def cli(
+    seed,
+    until,
+    log_tcp,
+    log_file,
+    log_telegraf,
+    calculation_efficiency,
+    job_file,
+    pre_job_rank,
+    machine_ads,
+    job_ads,
+    scheduler_type,
+    static_pool_files,
+    dynamic_pool_files,
+    storage_files,
+    remote_throughput,
+    filebased_caching,
+    cache_hitrate,
+):
     monitoring_logger = logging.getLogger()
     monitoring_logger.setLevel(logging.DEBUG)
     time_filter = SimulationTimeFilter()
@@ -66,150 +144,74 @@ def cli(ctx, seed, until, log_tcp, log_file, log_telegraf, calculation_efficienc
         telegrafHandler.setFormatter(LineProtocolFormatter(resolution=1))
         monitoring_logger.addHandler(telegrafHandler)
 
+    click.echo("starting hybrid environment")
 
-@cli.command()
-@click.option(
-    "--job-file",
-    "job_file",
-    type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))),
-)
-@click.option(
-    "--pool-file",
-    "pool_file",
-    type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))),
-    multiple=True,
-)
-@click.option(
-    "--storage-files",
-    "storage_files",
-    type=(
-        click.File("r"),
-        click.File("r"),
-        click.Choice(list(storage_import_mapper.keys())),
-    ),
-    default=(None, None, None),
-)
-@click.option("--remote-throughput", "remote_throughput", type=float, default=10)
-@click.option("--cache-hitrate", "cache_hitrate", type=float, default=None)
-@click.pass_context
-def static(ctx, job_file, pool_file, storage_files, remote_throughput, cache_hitrate):
-    click.echo("starting static environment")
-    simulator = Simulator(seed=ctx.obj["seed"])
-    file, file_type = job_file
+    simulator = Simulator(seed=seed)
+    infile, file_type = job_file
     simulator.create_job_generator(
-        job_input=file,
+        job_input=infile,
         job_reader=partial(
             job_import_mapper[file_type],
-            calculation_efficiency=ctx.obj["calculation_efficiency"],
+            calculation_efficiency=calculation_efficiency,
         ),
     )
-    simulator.create_scheduler(scheduler_type=CondorJobScheduler)
-    if all(storage_files):
-        simulator.create_connection_module(remote_throughput, False)
-        storage_file, storage_content_file, storage_type = storage_files
+    if scheduler_import_mapper[scheduler_type] == CondorClassadJobScheduler and any(
+        (pre_job_rank, machine_ads, job_ads)
+    ):
+        simulator.job_scheduler = CondorClassadJobScheduler(
+            job_queue=simulator.job_queue,
+            pre_job_rank=pre_job_rank,
+            machine_ad=machine_ads,
+            job_ad=job_ads,
+        )
+    else:
+        simulator.create_scheduler(
+            scheduler_type=scheduler_import_mapper[scheduler_type]
+        )
+
+    for current_storage_files in storage_files:
+        assert all(current_storage_files), "All storage inputs have to be available"
+        simulator.create_connection_module(remote_throughput, filebased_caching)
+        storage_file, storage_content_file, storage_type = current_storage_files
         simulator.create_storage(
             storage_input=storage_file,
            storage_content_input=storage_content_file,
             storage_reader=storage_import_mapper[storage_type],
-            storage_type=FileBasedHitrateStorage,
+            storage_type=FileBasedHitrateStorage,  # TODO: Generalize this
         )
-    for current_pool in pool_file:
+
+    for current_pool in static_pool_files:
         pool_file, pool_file_type = current_pool
+        if "dummycluster" in pool_file.name:
+            simulator.create_connection_module(float("Inf"))
         simulator.create_pools(
             pool_input=pool_file,
             pool_reader=pool_import_mapper[pool_file_type],
             pool_type=StaticPool,
         )
-    simulator.enable_monitoring()
-    simulator.run(until=ctx.obj["until"])
 
-
-@cli.command()
-@click.option(
-    "--job-file",
-    "job_file",
-    type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))),
-)
-@click.option(
-    "--pool-file",
-    "pool_file",
-    type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))),
-    multiple=True,
-)
-@click.pass_context
-def dynamic(ctx, job_file, pool_file):
-    click.echo("starting dynamic environment")
-    simulator = Simulator(seed=ctx.obj["seed"])
-    file, file_type = job_file
-    simulator.create_job_generator(
-        job_input=file,
-        job_reader=partial(
-            job_import_mapper[file_type],
-            calculation_efficiency=ctx.obj["calculation_efficiency"],
-        ),
-    )
-    simulator.create_scheduler(scheduler_type=CondorJobScheduler)
-    for current_pool in pool_file:
-        file, file_type = current_pool
+    for current_pool in dynamic_pool_files:
+        pool_file, pool_file_type = current_pool
+        if "dummycluster" in pool_file.name:
+            simulator.create_connection_module(float("Inf"))
         simulator.create_pools(
-            pool_input=file,
-            pool_reader=pool_import_mapper[file_type],
+            pool_input=pool_file,
+            pool_reader=pool_import_mapper[pool_file_type],
             pool_type=Pool,
             controller=SimulatedLinearController,
         )
-    simulator.enable_monitoring()
-    simulator.run(until=ctx.obj["until"])
 
-
-@cli.command()
-@click.option(
-    "--job-file",
-    "job_file",
-    type=(click.File("r"), click.Choice(list(job_import_mapper.keys()))),
-)
-@click.option(
-    "--static-pool-file",
-    "static_pool_file",
-    type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))),
-    multiple=True,
-)
-@click.option(
-    "--dynamic-pool-file",
-    "dynamic_pool_file",
-    type=(click.File("r"), click.Choice(list(pool_import_mapper.keys()))),
-    multiple=True,
-)
-@click.pass_context
-def hybrid(ctx, job_file, static_pool_file, dynamic_pool_file):
-    click.echo("starting hybrid environment")
-    simulator = Simulator(seed=ctx.obj["seed"])
-    file, file_type = job_file
-    simulator.create_job_generator(
-        job_input=file,
-        job_reader=partial(
-            job_import_mapper[file_type],
-            calculation_efficiency=ctx.obj["calculation_efficiency"],
-        ),
+    click.echo(
+        "scheduler configuration: \n "
+        f"\tscheduler type: {scheduler_type}\n\n"
+        f"\tpre job rank: {pre_job_rank} \n\n"
+        f"\tmachine classads:\n \t{machine_ads}\n\n"
+        f"\tjob classads: {job_ads}"
     )
-    simulator.create_scheduler(scheduler_type=CondorJobScheduler)
-    for current_pool in static_pool_file:
-        file, file_type = current_pool
-        simulator.create_pools(
-            pool_input=file,
-            pool_reader=pool_import_mapper[file_type],
-            pool_type=StaticPool,
-        )
-    for current_pool in dynamic_pool_file:
-        file, file_type = current_pool
-        simulator.create_pools(
-            pool_input=file,
-            pool_reader=pool_import_mapper[file_type],
-            pool_type=Pool,
-            controller=SimulatedLinearController,
-        )
+
     simulator.enable_monitoring()
-    simulator.run(until=ctx.obj["until"])
+    simulator.run(until=until)
 
 
 if __name__ == "__main__":
diff --git a/lapis/scheduler.py b/lapis/scheduler.py
index f0c0007..a56fae7 100644
--- a/lapis/scheduler.py
+++ b/lapis/scheduler.py
@@ -246,14 +246,13 @@ async def job_finished(self, job):
 class CondorJobScheduler(JobScheduler):
     """
     Goal of the htcondor job scheduler is to have a scheduler that somehow
-    mimics how htcondor does schedule jobs.
-    Htcondor does scheduling based on a priority queue. The priorities itself
+    mimics how htcondor schedules jobs.
+    Htcondor is scheduling based on a priority queue. The priorities itself
     are managed by operators of htcondor. So different instances can apparently
     behave very different.
-    In my case I am going to try building a priority queue that sorts job slots
-    by increasing cost. The cost itself is calculated based on the current
-    strategy that is used at GridKa. The scheduler checks if a job either
-    exactly fits a slot or if it does fit into it several times. The cost for
+    A priority queue that sorts job slots
+    by increasing costs is built. The scheduler checks if a job either
+    exactly fits a slot or if it fits several times. The cost for
     putting a job at a given slot is given by the amount of resources that
     might remain unallocated.
     """
@@ -421,7 +420,7 @@ def __init__(self, quantization: Dict[str, HTCInt], ranking: Expression):
 
     @abstractmethod
     def empty(self) -> bool:
-        """"Whether there are no resources available"""
+        """Whether there are no resources available"""
        raise NotImplementedError
 
     @abstractmethod
@@ -673,9 +672,8 @@ class CondorClassadJobScheduler(JobScheduler):
     Htcondor does scheduling based on a priority queue. The priorities itself
     are managed by operators of htcondor. So different instances can apparently
     behave very different.
-    In my case I am going to try building a priority queue that sorts job slots
-    by increasing cost. The cost itself is calculated based on the current
-    strategy that is used at GridKa. The scheduler checks if a job either
+    In this case a priority queue that sorts job slots
+    by increasing cost is built. The scheduler checks if a job either
     exactly fits a slot or if it does fit into it several times. The cost for
     putting a job at a given slot is given by the amount of resources that
     might remain unallocated.
diff --git a/lapis/simulator.py b/lapis/simulator.py
index a89b042..15c3a9a 100644
--- a/lapis/simulator.py
+++ b/lapis/simulator.py
@@ -138,7 +138,7 @@ def create_storage(
     def create_scheduler(self, scheduler_type):
         self.job_scheduler = scheduler_type(job_queue=self.job_queue)
 
-    def create_connection_module(self, remote_throughput, filebased_caching=True):
+    def create_connection_module(self, remote_throughput, filebased_caching=False):
         self.connection = Connection(remote_throughput, filebased_caching)
 
     def run(self, until=None):
diff --git a/pyproject.toml b/pyproject.toml
index 13a7f2d..6656297 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,3 +55,6 @@ test = [
     "flake8-bugbear",
     "black"
 ]
+
+[tool.poetry.scripts]
+simulate = 'lapis.cli.simulate:cli'
From b54c99c65aeb36caaff842ac3aba2f0c0ef6dc92 Mon Sep 17 00:00:00 2001
From: Eileen Kuehn
Date: Mon, 21 Jun 2021 18:52:27 +0200
Subject: [PATCH 646/648] checking all caches for file transfers, closes #11

---
 lapis/caching/connection.py     | 26 +++++++++++++-------------
 lapis/caching/storageelement.py |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/lapis/caching/connection.py b/lapis/caching/connection.py
index 3f48aca..c37e9e0 100644
--- a/lapis/caching/connection.py
+++ b/lapis/caching/connection.py
@@ -1,6 +1,6 @@
 import random
 
-from typing import Union, Optional, Tuple
+from typing import Dict, List, Union, Optional, Tuple
 
 from usim import Scope, time
 from lapis.caching.monitoredpipe import MonitoredPipe
@@ -48,7 +48,7 @@ def __init__(self, throughput: float, filebased_caching: bool = True):
         :param throughput: throughput of the connection's remote storage
         :param filebased_caching:
         """
-        self.storages = dict()
+        self.storages: Dict[str, Optional[List[StorageElement]]] = {}
         """dictionary containing storage objects known to the connection module"""
         self.remote_connection = RemoteStorage(throughput=throughput)
         """pipe object representing the connection to a remote storage"""
@@ -95,7 +95,7 @@ def add_storage_element(self, storage_element: StorageElement):
             self.storages[storage_element.sitename] = [storage_element]
 
     def _determine_inputfile_source(
-        self, requested_file: RequestedFile, dronesite: Optional[str]
+        self, requested_file: RequestedFile, dronesite: str
     ) -> Union[StorageElement, RemoteStorage]:
         """
         Collects NamedTuples containing the amount of data of the requested file
@@ -124,7 +124,7 @@
         return self.remote_connection
 
     async def stream_file(
-        self, requested_file: RequestedFile, dronesite: Optional[str]
+        self, requested_file: RequestedFile, dronesite: str
    ) -> TransferStatistics:
         """
         Determines which storage object is used to provide the requested file and
@@ -136,12 +136,11 @@ async def stream_file(
         :param dronesite:
         """
         used_connection = self._determine_inputfile_source(requested_file, dronesite)
-        if self._filebased_caching:
-            if used_connection == self.remote_connection and self.storages.get(
-                dronesite, None
-            ):
-                try:
-                    potential_cache = random.choice(self.storages[dronesite])
+        if self._filebased_caching and used_connection == self.remote_connection:
+            try:
+                storages = self.storages[dronesite]
+                if storages:
+                    potential_cache = random.choice(storages)
                     cache_file, files_for_deletion = self.caching_algorithm.consider(
                         file=requested_file, storage=potential_cache
                     )
@@ -155,8 +154,8 @@
                         f"File {requested_file.filename}: File wasnt "
                         f"cached @ {time.now}"
                     )
-                except KeyError:
-                    pass
+            except KeyError:
+                pass
 
         transfer_statistics = await used_connection.transfer(requested_file)
         return transfer_statistics
@@ -183,8 +182,9 @@ async def transfer_files(
             if "hitrates" in random_inputfile_information.keys():
                 cached_bytes = sum(
                     [
-                        file["usedsize"] * file["hitrates"].get(drone.sitename, 0.0)
+                        file["usedsize"] * file["hitrates"].get(cache.name, 0.0)
                         for file in requested_files.values()
+                        for cache in self.storages.get(drone.sitename, [])
                     ]
                 )
                 # TODO: should be 1 in case of requested_bytes == 0
diff --git a/lapis/caching/storageelement.py b/lapis/caching/storageelement.py
index 4b97816..390cd08 100644
--- a/lapis/caching/storageelement.py
+++ b/lapis/caching/storageelement.py
@@ -100,7 +100,7 @@ def __init__(
         self,
         files: Dict[str, StoredFile],
         name: Optional[str] = None,
-        sitename: Optional[str] = None,
+        sitename: str = "default",
         size: int = 1000 * 1000 * 1000 * 1000,
         throughput_limit: int = 10 * 1000 * 1000 * 1000,
         deletion_duration: float = 5,
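An illustrative recalculation (not part of the patch above) of the hit-rate weighting changed in `transfer_files`: with the new inner loop, every cache registered for the drone's site contributes to `cached_bytes`, whereas previously only a single site-level hitrate entry was looked up. The numbers and cache names below are invented, and plain strings stand in for the `StorageElement.name` attribute.

# One requested file of 4 MB with per-cache hitrates at the drone's site.
requested_files = {
    "input_0": {"usedsize": 4_000_000, "hitrates": {"cache_a": 1.0, "cache_b": 0.0}},
}
site_caches = ["cache_a", "cache_b"]  # stands in for self.storages[drone.sitename]

cached_bytes = sum(
    entry["usedsize"] * entry["hitrates"].get(cache, 0.0)
    for entry in requested_files.values()
    for cache in site_caches
)
print(cached_bytes)  # 4000000: fully available in cache_a, absent in cache_b
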
From 0aa3e63b7bd300478c77e3f043613a12c43be017 Mon Sep 17 00:00:00 2001
From: Eileen Kuehn
Date: Mon, 21 Jun 2021 19:32:44 +0200
Subject: [PATCH 647/648] added static tests for gh actions

---
 .github/workflows/linter.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/workflows/linter.yml

diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml
new file mode 100644
index 0000000..8d36316
--- /dev/null
+++ b/.github/workflows/linter.yml
@@ -0,0 +1,23 @@
+name: Static Checks
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install .[test]
+    - name: Lint with flake8
+      run: |
+        flake8 lapis lapis_tests
+    - name: Format with black
+      run: |
+        black lapis lapis_tests --diff --check

From cc9f45f0c751e03e4db6309e7704a897aea66500 Mon Sep 17 00:00:00 2001
From: Eileen Kuehn
Date: Mon, 21 Jun 2021 19:52:14 +0200
Subject: [PATCH 648/648] moved unit tests to gh actions

---
 .github/workflows/unittest.yml | 27 +++++++++++++++++++++++++++
 .travis.yml                    | 27 ---------------------------
 2 files changed, 27 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/unittest.yml
 delete mode 100644 .travis.yml

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
new file mode 100644
index 0000000..c4d0baa
--- /dev/null
+++ b/.github/workflows/unittest.yml
@@ -0,0 +1,27 @@
+name: Unit Tests
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['pypy3', '3.6', '3.7', '3.8', '3.9']
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install .[test]
+        python -m pip install codecov pytest-cov
+    - name: Test with pytest
+      run: |
+        pytest --cov=./
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v1
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 457dcbf..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-dist: xenial
-language: python
-python:
-  - "3.6"
-  - "3.7"
-  - "pypy3"
-os:
-  - linux
-#  - osx  # osx+python installation fails
-before_script:
-  - python -m pip install pip --upgrade
-  - python -m pip --version
-  - export PYTHONHASHSEED=${PYTHONHASHSEED:-${RANDOM}}
-  - echo "export PYTHONHASHSEED=${PYTHONHASHSEED}"
-  - python -m pip install .[test]
-  - python -m pip install codecov
-  - export COVERAGE_PROCESS_START=$(pwd)/.coveragerc
-  - export PYTEST_ADDOPTS=-v
-script:
-  - python -m flake8
-  - |
-    if [[ $TRAVIS_PYTHON_VERSION != 'pypy3'* ]]; then
-      python -m black --target-version py36 --check lapis/ lapis_tests/
-    fi
-  - python -m coverage run -m pytest
-after_success:
-  - coverage report && codecov