From e17395a2ba7e04fadf78d400f2003dee3e37d899 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Sun, 14 Jan 2024 16:05:10 -0600 Subject: [PATCH 1/5] Fix data dependency --- src/c/backend/include/phases.hpp | 1 + src/c/backend/include/runtime.hpp | 6 + src/c/backend/phases.cpp | 85 +- src/dev.patch | 2498 +++++++++++++++++++++++++++++ src/python/parla/cython/tasks.pyx | 7 +- 5 files changed, 2590 insertions(+), 7 deletions(-) create mode 100644 src/dev.patch diff --git a/src/c/backend/include/phases.hpp b/src/c/backend/include/phases.hpp index 8b3e4a6a..2871c937 100644 --- a/src/c/backend/include/phases.hpp +++ b/src/c/backend/include/phases.hpp @@ -277,6 +277,7 @@ class MemoryReserver : virtual public SchedulerPhase { * @param task The task to create data movement tasks for. */ void create_datamove_tasks(InnerTask *task); + void create_datamove_tasks2(InnerTask *task); }; /** diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp index de7da5da..88e06018 100644 --- a/src/c/backend/include/runtime.hpp +++ b/src/c/backend/include/runtime.hpp @@ -293,6 +293,8 @@ class InnerTask { std::vector>> parray_list; + std::unordered_map> parray_dependencies_map; + InnerTask(); InnerTask(long long int id, void *py_task); InnerTask(std::string name, long long int id, void *py_task); @@ -623,6 +625,10 @@ class InnerTask { void begin_multidev_req_addition(); void end_multidev_req_addition(); + std::vector& get_parray_dependencies(uint64_t parray_parent_id) { + return this->parray_dependencies_map[parray_parent_id]; + } + PlacementRequirementCollections &get_placement_req_options() { return placement_req_options_; } diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp index 0bcb091a..2713e21f 100644 --- a/src/c/backend/phases.cpp +++ b/src/c/backend/phases.cpp @@ -237,6 +237,89 @@ void MemoryReserver::create_datamove_tasks(InnerTask *task) { task->add_dependencies(data_tasks, true); } + + + +void MemoryReserver::create_datamove_tasks2(InnerTask *task) { + // Get a list of the parrays the current task holds. + const std::vector>> + &parray_list = task->parray_list; + std::string task_base_name = task->get_name(); + std::vector data_tasks; + data_tasks.reserve(parray_list.size()); + + for (size_t i = 0; i < parray_list.size(); ++i) { + for (size_t j = 0; j < parray_list[i].size(); ++j) { + // Create a data movement task for each PArray. + parray::InnerPArray *parray = parray_list[i][j].first; + AccessMode access_mode = parray_list[i][j].second; + InnerDataTask *datamove_task = new InnerDataTask( + // TODO(hc): id should be updated! + task_base_name + ".dm." + std::to_string(i), 0, parray, access_mode, + i); + uint64_t parray_parent_id = parray->get_parent_parray()->id; + // Find dependency intersection between compute and data movement tasks. + + // TODO(hc): This is not the complete implementation. + // We will use a concurrent map for parray's + // task list as an optimization. + + std::vector compute_task_dependencies = task->get_dependencies(); + std::vector data_task_dependencies; + for (size_t k = 0; k < compute_task_dependencies.size(); ++k) { + InnerTask *parray_dependency = + static_cast(compute_task_dependencies[k]); + std::vector& dep_parray_dependencies = + parray_dependency->get_parray_dependencies(parray_parent_id); + + std::cout << parray_dependency->name << " is being traversed\n"; + for (size_t t = 0; t < dep_parray_dependencies.size(); ++t) { + data_task_dependencies.push_back(parray_dependency); + // If the current processing parray's access mode is READ ONLY, + // add this dependency as a dependency for this parray. + std::cout << "access mode:" << int(access_mode) << "\n"; + if (access_mode == AccessMode::IN) { + std::cout << "IN parray is added:" << parray_parent_id << "\n"; + task->get_parray_dependencies(parray_parent_id).push_back(parray_dependency); + } + } + } + + // If the current processing parray's access mode is not READ ONLY, + // add itself as a dependency for this parray. + std::cout << task->name << " is being traversed access id :" << int(access_mode) << "\n"; + if (access_mode != AccessMode::IN) { + std::cout << "IN/OUT OUT parray is added:" << parray_parent_id << "\n"; + task->get_parray_dependencies(parray_parent_id).push_back(task); + } + + // TODO(hc): pass false to add_dependencies() as optimization. + datamove_task->add_dependencies(data_task_dependencies, true); + // Copy assigned devices to a compute task to a data movement task. + // TODO(hc): When we support xpy, it should be devices corresponding + // to placements of the local partition. + auto device = task->get_assigned_devices()[i]; + datamove_task->add_assigned_device(device); + + datamove_task->device_constraints.emplace( + std::piecewise_construct, + std::forward_as_tuple(device->get_global_id()), + std::forward_as_tuple(0, 0, 1)); + + data_tasks.push_back(datamove_task); + // Add the created data movement task to a reserved task queue. + this->scheduler->increase_num_active_tasks(); + this->reserved_tasks_buffer.push_back(datamove_task); + } + } + + // Create dependencies between data move task and compute tasks. + task->add_dependencies(data_tasks, true); +} + + + + void MemoryReserver::run(SchedulerPhase *next_phase) { NVTX_RANGE("MemoryReserver::run", NVTX_COLOR_LIGHT_GREEN) @@ -263,7 +346,7 @@ void MemoryReserver::run(SchedulerPhase *next_phase) { if (can_reserve) { this->reserve_resources(task); this->reservable_tasks->pop(); - this->create_datamove_tasks(task); + this->create_datamove_tasks2(task); this->reserved_tasks_buffer.push_back(task); } else { // TODO:(wlr) we need some break condition to allow the scheduler to diff --git a/src/dev.patch b/src/dev.patch new file mode 100644 index 00000000..1789036a --- /dev/null +++ b/src/dev.patch @@ -0,0 +1,2498 @@ +diff --git a/src/c/backend/include/memory_manager.hpp b/src/c/backend/include/memory_manager.hpp +new file mode 100644 +index 0000000..b1467ec +--- /dev/null ++++ b/src/c/backend/include/memory_manager.hpp +@@ -0,0 +1,420 @@ ++#ifndef PARLA_MEMORY_MANNGER_HPP ++#define PARLA_MEMORY_MANNGER_HPP ++ ++#include "device_manager.hpp" ++#include "parray.hpp" ++ ++ ++/** ++ * @brief Node type of a PArray eviction double-linked list. ++ */ ++class PArrayNode { ++public: ++ PArrayNode(parray::InnerPArray *parr, size_t prior = 0) : ++ parray(parr), priority(prior), next(nullptr), prev(nullptr) ++ {} ++ ++ /// Pointer of a PArray instance ++ parray::InnerPArray *parray; ++ /// Priority of the node ++ /// TODO(hc): This is not used, but keep it for the future ++ size_t priority; ++ /// Pointers to the next and the previous PArrayNodes ++ PArrayNode *next; ++ PArrayNode *prev; ++}; ++ ++/** ++ * @brief Double-linked list of candidate PArrays for eviction. ++ * @details PArray eviction manager selects and evicts PArray instances ++ * in this list depending on an eviction policy. ++ * Note that an eviction manager manages this list for each device. ++ */ ++class DoubleLinkedList { ++public: ++ ++ /** ++ * @brief Print the current list. ++ */ ++ void print() { ++ PArrayNode *node = this->head_; ++ std::cout << "\n"; ++ while (node != nullptr) { ++ std::cout << node->parray->id << " -> \n"; ++ node = node->next; ++ } ++ std::cout << "\n"; ++ } ++ ++ /** ++ * @brief Append a PArray node to the list. ++ * @detail The first PArray of the list is set to both head and tail, and ++ * the last added PArray is set to tail. ++ * ++ * @param node PArray node to be appended ++ */ ++ void append(PArrayNode *node) { ++ this->mtx_.lock(); ++ if (this->list_size_ == 0) { ++ this->head_ = node; ++ this->tail_ = node; ++ } else { ++ this->tail_->next = node; ++ node->prev = this->tail_; ++ this->tail_ = node; ++ } ++ this->list_size_ += 1; ++ this->mtx_.unlock(); ++ } ++ ++ /** ++ * @brief Insert a PArray node between `node` and `node->next`. ++ * ++ * @param node existing PArray node where `new_node` is being linked ++ * @param new_node PArray node to be appended after `node` ++ */ ++ void insert_after(PArrayNode *node, PArrayNode *new_node) { ++ this->mtx_.lock(); ++ if (node->next != nullptr) { ++ node->next->prev = new_node; ++ new_node->next = node->next; ++ } else { ++ this->tail_ = new_node; ++ } ++ node->next = new_node; ++ new_node->prev = node; ++ this->mtx_.unlock(); ++ } ++ ++ /** ++ * @brief Insert a PArray node between `node` and `node->prev`. ++ * ++ * @param node existing PArray node where `new_node` is being linked ++ * @param new_node PArray node to be appended before `node` ++ */ ++ void insert_before(PArrayNode *node, PArrayNode *new_node) { ++ this->mtx_.lock(); ++ if (node->prev != nullptr) { ++ node->prev->next = new_node; ++ new_node->prev = node->prev; ++ } else { ++ this->head_ = new_node; ++ } ++ node->prev = new_node; ++ new_node->next = node; ++ this->mtx_.unlock(); ++ } ++ ++ /** ++ * @brief Remove and return the current head PArray node from a list. ++ */ ++ PArrayNode *remove_head() { ++ this->mtx_.lock(); ++ PArrayNode *old_head = this->head_; ++ if (old_head != nullptr) { ++ this->remove_unsafe(old_head); ++ } ++ this->mtx_.unlock(); ++ return old_head; ++ } ++ ++ /** ++ * @brief Remove a node and return true if it is removed false otherwise. ++ * ++ * @param node PArray node to be removed from a list ++ */ ++ bool remove(PArrayNode *node) { ++ this->mtx_.lock(); ++ bool rv = this->remove_unsafe(node); ++ this->mtx_.unlock(); ++ return rv; ++ } ++ ++ /** ++ * @brief Remove a node and return true if it is removed false otherwise. ++ * This function is not thread safe. ++ * ++ * @param node PArray node to be removed from a list ++ */ ++ bool remove_unsafe(PArrayNode *node) { ++ if (node->prev == nullptr && node->next == nullptr && ++ node != this->head_ && node != this->tail_) { ++ // If a node is not in a list, do nothing and return false. ++ return false; ++ } ++ ++ if (this->list_size_ == 1) { ++ // A node is a single node in a list. ++ this->head_ = this->tail_ = nullptr; ++ } else { ++ if (this->head_ == node) { ++ // A node is a head, and so break link of node->next->prev. ++ this->head_ = node->next; ++ node->next->prev = nullptr; ++ } else if (this->tail_ == node) { ++ // A node is a tail, and so break link of node->prev->next. ++ this->tail_ = node->prev; ++ node->prev->next = nullptr; ++ } else { ++ // A node is in the middle of a list, and so break two links. ++ node->prev->next = node->next; ++ node->next->prev = node->prev; ++ } ++ } ++ node->prev = node->next = nullptr; ++ this->list_size_ -= 1; ++ return true; ++ } ++ ++ /** ++ * @brief Return a size of a list. ++ */ ++ size_t size() { ++ this->mtx_.lock(); ++ size_t list_size = this->list_size_; ++ this->mtx_.unlock(); ++ return list_size; ++ } ++ ++ /** ++ * @brief Return the current head. ++ * This function is not thread safe. ++ */ ++ PArrayNode *get_head() { ++ return this->head_; ++ } ++ ++ /** ++ * @brief Return the current tail. ++ * This function is not thread safe. ++ */ ++ PArrayNode *get_tail() { ++ return this->tail_; ++ } ++ ++private: ++ PArrayNode *head_{nullptr}; ++ PArrayNode *tail_{nullptr}; ++ std::mutex mtx_; ++ size_t list_size_{0}; ++}; ++ ++ ++/** ++ * @brief Least-recently-used policy based eviction manager for a device. ++ * @details It holds PArrays which are not referenced by tasks which are ++ * between task mapping and termination phases. ++ */ ++class LRUDeviceEvictionManager { ++public: ++ struct PArrayMetaInfo { ++ // Points to a PArray node if it exists ++ PArrayNode *parray_node_ptr; ++ // The number of references to a PArray ++ size_t ref_count; ++ }; ++ ++ LRUDeviceEvictionManager(DevID_t dev_id) : dev_id_(dev_id) {} ++ ++ /** ++ * @brief A task refers `parray` in the device. ++ * @detail This function is called when a task being mapped ++ * refers `parray`. This increases a reference count of the PArray ++ * and removes it from a zero-referenced list if it exists. ++ * ++ * @param parray pointer to a parray to be referred by a task ++ */ ++ void grab_parray_reference(parray::InnerPArray *parray) { ++ this->mtx_.lock(); ++ uint64_t parray_id = parray->id; ++ auto found = this->parray_reference_counts_.find(parray_id); ++ if (found == this->parray_reference_counts_.end()) { ++ // Add `parray` to a zr list if it does not exist. ++ PArrayNode *parray_node = new PArrayNode(parray); ++ this->parray_reference_counts_[parray_id] = ++ PArrayMetaInfo{parray_node, 1}; ++ } else { ++ // If `parray` is already in a zr list, removes it ++ // from the list and increases its reference count. ++ found->second.ref_count++; ++ this->zr_parray_list_.remove(found->second.parray_node_ptr); ++ } ++ this->mtx_.unlock(); ++ } ++ ++ /** ++ * @brief A task is finished and releases `parray` in the device. ++ * @detail This function is called by a worker thread when a task ++ * assigned to that thread is completed. The thread releases the ++ * `parray` instance, and decreases its reference count in the device. ++ * If the reference count becomes 0, the `parray` is added to ++ * the zero-referenced list. ++ * ++ * @param parray pointer to a parray to be released by a task ++ */ ++ void release_parray_reference(parray::InnerPArray *parray) { ++ this->mtx_.lock(); ++ uint64_t parray_id = parray->id; ++ auto found = this->parray_reference_counts_.find(parray_id); ++ if (found != this->parray_reference_counts_.end()) { ++ found->second.ref_count--; ++ if (found->second.ref_count == 0) { ++ // If none of task referes to `parray`, add it to ++ // a zr list. ++ this->zr_parray_list_.append(found->second.parray_node_ptr); ++ } ++ } ++ this->mtx_.unlock(); ++ } ++ ++ ++ /** ++ * @brief Return a size of a list. ++ */ ++ size_t size() { ++ size_t zr_parray_list_size{0}; ++ this->mtx_.lock(); ++ zr_parray_list_size = zr_parray_list_.size(); ++ this->mtx_.unlock(); ++ return zr_parray_list_size; ++ } ++ ++ /** ++ * @brief Remove and return a head of the zero-referenced list. ++ * @detail This function is not thread safe since it assumes that only ++ * the scheduler thread calls into this function during eviction. ++ */ ++ PArrayNode *remove_and_return_head_from_zrlist() { ++ PArrayNode* old_head{nullptr}; ++ this->mtx_.lock(); ++ old_head = this->zr_parray_list_.remove_head(); ++ this->mtx_.unlock(); ++ return old_head; ++ } ++ ++ /** ++ * @brief This function clears all existing PArrays in the ++ * zero-referenced list. ++ * @detail This function has two purposes. ++ * First, it is used to fix unlinked Python and C++ PArray ++ * instances. It is possible that Python PArrays are destroyed ++ * due to, for example, out-of-scope. Then, C++ PArrays ++ * start to hold invalid Python PArray pointers. ++ * When a scheduler starts PArray eviction, it is possible that ++ * the C++ PArrays holding invalid Python PArrays are chosen ++ * as evictable PArrays and causes segmentation fault. ++ * This function removes those PArrays in advance to avoid ++ * this issue (But users should be aware of and take care of this scenario). ++ * The second purpose is to allow users to clear all memory ++ * related states managed by the Parla runtime. ++ */ ++ // TODO(hc): This bulk flushing is not ideal IMO. The Parla runtime ++ // should provide a function that flushes only a single PArray. ++ // I am postponing this work since we need to take care of ++ // the zero-referenced list, but I have higher priorities. ++ void clear_all_instances() { ++ this->mtx_.lock(); ++ PArrayNode* head{nullptr}; ++ do { ++ head = this->zr_parray_list_.remove_head(); ++ } while (head != nullptr); ++ this->mtx_.unlock(); ++ } ++ ++private: ++ /// This eviction manager manages PArray instances in this device ++ DevID_t dev_id_; ++ std::mutex mtx_; ++ /// Key: PArray ID, Value: Meta information including reference ++ /// count of a PArray ++ std::unordered_map parray_reference_counts_; ++ /// A list of zero-referenced PArrays. ++ DoubleLinkedList zr_parray_list_; ++}; ++ ++ ++/** ++ * @brief Least-recently-used policy based global eviction manager. ++ * @details External components access and manipulate PArray instances in any ++ * device through this manager. ++ */ ++class LRUGlobalEvictionManager { ++public: ++ LRUGlobalEvictionManager(DeviceManager *device_manager) : ++ device_manager_(device_manager) { ++ this->device_mm_.resize( ++ device_manager->template get_num_devices()); ++ for (size_t i = 0; i < this->device_mm_.size(); ++i) { ++ this->device_mm_[i] = new LRUDeviceEvictionManager(i); ++ } ++ } ++ ++ /** ++ * @brief A task refers `parray` in `dev_id` device. ++ * @detail This function is called when a task being mapped ++ * refers `parray`. This increases a reference count of the PArray ++ * and removes it from a zero-referenced list if it exists. ++ * ++ * @param parray pointer to a parray to be referred by a task ++ * @param dev_id device id of a device to access its information ++ */ ++ void grab_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) { ++ this->device_mm_[dev_id]->grab_parray_reference(parray); ++ } ++ ++ /** ++ * @brief A task is finished and releases `parray` in `dev_id` device. ++ * @detail This function is called by a worker thread when a task ++ * assigned to that completes. So, the thread also releases a ++ * `parray`, and decreases a reference count of that in the device. ++ * If the reference count becomes 0, the `parray` is added to ++ * its zero-referenced list. ++ * ++ * @param parray pointer to a parray to be released by a task ++ * @param dev_id device id of a device to access its information ++ */ ++ void release_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) { ++ this->device_mm_[dev_id]->release_parray_reference(parray); ++ } ++ ++ /** ++ * @brief Return a size of a list. ++ * ++ * @param dev_id device id of a device to access its information ++ */ ++ size_t size(DevID_t dev_id) { ++ return this->device_mm_[dev_id]->size(); ++ } ++ ++ /** ++ * @brief Remove and return a head of the zero-referenced list. ++ * @detail This function is not thread safe since it assumes that only ++ * the scheduler thread calls into this function during eviction. ++ * ++ * @param dev_id device id of a device to access its information ++ */ ++ void *remove_and_return_head_from_zrlist(DevID_t dev_id) { ++ PArrayNode *old_head = ++ this->device_mm_[dev_id]->remove_and_return_head_from_zrlist(); ++ void *py_parray{nullptr}; ++ if (old_head != nullptr) { ++ parray::InnerPArray *c_parray = old_head->parray; ++ py_parray = c_parray->get_py_parray(); ++ } ++ return py_parray; ++ } ++ ++ void clear_all_instances() { ++ for (size_t i = 0; i < device_mm_.size(); ++i) { ++ device_mm_[i]->clear_all_instances(); ++ } ++ } ++ ++private: ++ /// Device manager managing system environment ++ DeviceManager *device_manager_; ++ /// A list of LRU-based eviction manager for each device ++ std::vector device_mm_; ++}; ++ ++#endif +diff --git a/src/c/backend/include/parray.hpp b/src/c/backend/include/parray.hpp +index 68d3de6..aa6fa56 100644 +--- a/src/c/backend/include/parray.hpp ++++ b/src/c/backend/include/parray.hpp +@@ -30,8 +30,8 @@ enum AccessMode { + INOUT = 2, + /// Output of a task. + OUT = 3, +- /// Removed PArray (false everywhere). +- REMOVED = 4, ++ /// Freed PArray (false everywhere). ++ FREED = 4, + /// Deleted PArray (removed from table). + DELETED = 5, + }; +@@ -59,7 +59,7 @@ public: + /// compared to restructuring overheads), but this counter is decreased. + /// This is used to provide more accurate PArray placement information + /// to the task mapping step. +- std::vector> num_active_tasks; ++ std::vector> num_referring_tasks; + + InnerPArray() = delete; + InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *, +@@ -92,14 +92,14 @@ public: + /// Add a pointer of the task that will use this PArray to the task list + void add_task(InnerTask *task); + +- /// Increase the counter for the active tasks that use this PArray. +- void incr_num_active_tasks(DevID_t global_dev_id); ++ /// Increase the number of the tasks referring this PArray. ++ void incr_num_referring_tasks(DevID_t global_dev_id); + +- /// Decrease the counter for the active tasks that use this PArray. +- void decr_num_active_tasks(DevID_t global_dev_id); ++ /// Decrease the number of the tasks referring this PArray. ++ void decr_num_referring_tasks(DevID_t global_dev_id); + +- /// Get the number of counter for the active tasks that use this PArray. +- size_t get_num_active_tasks(DevID_t global_dev_id); ++ /// Get the number of the tasks referring this PArray. ++ size_t get_num_referring_tasks(DevID_t global_dev_id); + + // TODO(hc): I will replace this list with a concurrent map. + /// Get a reference to a list of tasks who are using this PArray +@@ -136,8 +136,8 @@ private: + // I will use this map: https://github.com/greg7mdp/parallel-hashmap + // I have used this for a while and it is good. + TaskList _task_lists; +- ++ /// Python PArray address + void *_py_parray; + }; + +-} // namespace parray +\ No newline at end of file ++} // namespace parray +diff --git a/src/c/backend/include/parray_tracker.hpp b/src/c/backend/include/parray_tracker.hpp +index b49c62f..c46141b 100644 +--- a/src/c/backend/include/parray_tracker.hpp ++++ b/src/c/backend/include/parray_tracker.hpp +@@ -13,6 +13,10 @@ + + using namespace parray; + ++/** ++ * @brief PArray tracker that tracks PArray mapping state. ++ * Note that this does not track slice PArrays, but a complete PArray. ++ */ + class PArrayTracker { + public: + PArrayTracker(size_t num_devices) : num_devices_(num_devices) { +@@ -204,7 +208,7 @@ protected: + + /// Any worker thread can update states of PArrays. + /// Guard operations by this lock. +- /// TODO(hc): This will be replaced with parallel hash map. ++ /// TODO(hc): This will be replaced with parallel hash map (phmap). + std::mutex mtx; + }; + +diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp +index f38cfac..17f72b3 100644 +--- a/src/c/backend/include/runtime.hpp ++++ b/src/c/backend/include/runtime.hpp +@@ -44,6 +44,7 @@ + #include "parray_tracker.hpp" + #include "profiling.hpp" + #include "resource_requirements.hpp" ++#include "memory_manager.hpp" + + using namespace std::chrono_literals; + using namespace parray; +@@ -985,9 +986,26 @@ public: + /* Should Run, Stop Condition */ + std::atomic should_run = true; + ++ /* Clear all Python PArray references if users request */ ++ std::atomic clear_all_pyparrays = false; ++ /* Clear all C PArray objects if users request in eviction manager */ ++ std::atomic clear_all_cparrays = false; ++ + /* Phase: maps tasks to devices */ + Mapper *mapper; + ++ /* If it is set, break an infinite loop in InnerScheduler::run() ++ and invoke PArray eviction from PythonScheduler */ ++ bool break_for_eviction = false; ++ /* Memory size to evict for each device */ ++ std::vector memory_size_to_evict{0}; ++ ++ /* Set necessary memory size getting from eviction manager ++ on each device */ ++ void set_memory_size_to_evict(size_t, DevID_t); ++ /* Get memory size to evict for each device */ ++ size_t get_memory_size_to_evict(DevID_t); ++ + /* Phase reserves resources to limit/plan task execution*/ + MemoryReserver *memory_reserver; + RuntimeReserver *runtime_reserver; +@@ -995,7 +1013,7 @@ public: + /*Responsible for launching a task. Signals worker thread*/ + Launcher *launcher; + +- InnerScheduler(DeviceManager *device_manager); ++ InnerScheduler(LRUGlobalEvictionManager *memory_manager, DeviceManager *device_manager); + ~InnerScheduler(); + // InnerScheduler(int nworkers); + +@@ -1017,6 +1035,9 @@ public: + /* Set Python "stop" callback */ + void set_stop_callback(stopfunc_t stop_callback); + ++ /* Get a flag that represents if there is still a task to be executed */ ++ bool get_should_run(); ++ + /* Run the scheduler thread. Active for the lifetime of the Parla program */ + void run(); + +@@ -1087,6 +1108,20 @@ public: + /* Release a PArray in a device */ + void remove_parray(InnerPArray *parray, DevID_t global_dev_id); + ++ /* Reflect a PArray removal to PArray trackers */ ++ void remove_parray_from_tracker( ++ parray::InnerPArray *parray, DevID_t global_dev_id); ++ ++ void grab_parray_reference( ++ parray::InnerPArray *parray, DevID_t global_dev_id) { ++ this->mm_->grab_parray_reference(parray, global_dev_id); ++ } ++ ++ void release_parray_reference( ++ parray::InnerPArray *parray, DevID_t global_dev_id) { ++ this->mm_->release_parray_reference(parray, global_dev_id); ++ } ++ + /* Get mapped memory on device */ + size_t get_mapped_memory(DevID_t global_dev_id); + +@@ -1108,10 +1143,17 @@ public: + + DeviceManager *get_device_manager() { return this->device_manager_; } + ++ /* Invoke clearing all C PArray instances from a PArray eviction manager. */ ++ void invoke_all_cparrays_clear(); ++ ++ /* Get a flag that represents whether Python PArrays should be cleared */ ++ bool get_all_pyparrays_clear_flag(); + protected: + /// It manages all device instances in C++. + /// This is destructed by the Cython scheduler. + DeviceManager *device_manager_; ++ ++ LRUGlobalEvictionManager *mm_; + }; + + #endif // PARLA_BACKEND_HPP +diff --git a/src/c/backend/memory_manager.cpp b/src/c/backend/memory_manager.cpp +new file mode 100644 +index 0000000..f829294 +--- /dev/null ++++ b/src/c/backend/memory_manager.cpp +@@ -0,0 +1 @@ ++#include "include/memory_manager.hpp" +diff --git a/src/c/backend/parray.cpp b/src/c/backend/parray.cpp +index 5e555d4..20cb46a 100644 +--- a/src/c/backend/parray.cpp ++++ b/src/c/backend/parray.cpp +@@ -9,7 +9,7 @@ InnerPArray::InnerPArray(void *py_parray, uint64_t id, uint64_t parent_id, + DevID_t num_devices) + : _py_parray(py_parray), id(id), parent_id(parent_id), + _parent_parray(parent_parray), _state(state), _num_devices(num_devices) { +- num_active_tasks.resize(num_devices); ++ num_referring_tasks.resize(num_devices); + } + + const uint64_t InnerPArray::get_size() const { return this->_size; } +@@ -29,32 +29,32 @@ void InnerPArray::add_task(InnerTask *task) { + this->_task_lists.push_back(task); + } + +-void InnerPArray::incr_num_active_tasks(DevID_t global_dev_id) { ++void InnerPArray::incr_num_referring_tasks(DevID_t global_dev_id) { + if (this->_parent_parray != nullptr) { +- this->_parent_parray->num_active_tasks[global_dev_id].fetch_add( ++ this->_parent_parray->num_referring_tasks[global_dev_id].fetch_add( + 1, std::memory_order_relaxed); + } else { +- this->num_active_tasks[global_dev_id].fetch_add(1, ++ this->num_referring_tasks[global_dev_id].fetch_add(1, + std::memory_order_relaxed); + } + } + +-void InnerPArray::decr_num_active_tasks(DevID_t global_dev_id) { ++void InnerPArray::decr_num_referring_tasks(DevID_t global_dev_id) { + if (this->_parent_parray != nullptr) { +- this->_parent_parray->num_active_tasks[global_dev_id].fetch_sub( ++ this->_parent_parray->num_referring_tasks[global_dev_id].fetch_sub( + 1, std::memory_order_relaxed); + } else { +- this->num_active_tasks[global_dev_id].fetch_sub(1, ++ this->num_referring_tasks[global_dev_id].fetch_sub(1, + std::memory_order_relaxed); + } + } + +-size_t InnerPArray::get_num_active_tasks(DevID_t global_dev_id) { ++size_t InnerPArray::get_num_referring_tasks(DevID_t global_dev_id) { + if (this->_parent_parray != nullptr) { +- return this->_parent_parray->num_active_tasks[global_dev_id].load( ++ return this->_parent_parray->num_referring_tasks[global_dev_id].load( + std::memory_order_relaxed); + } else { +- return this->num_active_tasks[global_dev_id].load( ++ return this->num_referring_tasks[global_dev_id].load( + std::memory_order_relaxed); + } + } +diff --git a/src/c/backend/parray_tracker.cpp b/src/c/backend/parray_tracker.cpp +index ac5e88d..e5226d7 100644 +--- a/src/c/backend/parray_tracker.cpp ++++ b/src/c/backend/parray_tracker.cpp +@@ -12,7 +12,6 @@ size_t PArrayTracker::do_parray_creation_(AccessMode access_mode, + if (access_mode >= AccessMode::OUT || is_tracked) { + return 0; + } else { +- + size_t to_move = parray->get_size(); + + // std::cout << "PArrayTracker::do_parray_creation " << std::endl; +@@ -65,8 +64,8 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id, + this->set_parray_unsafe(i, parray_id, false); + } + +- if (access_mode != AccessMode::REMOVED) { +- this->set_parray_unsafe(dev_id, parray_id, false); ++ if (access_mode != AccessMode::FREED) { ++ this->set_parray_unsafe(dev_id, parray_id, true); + } + + if (is_slice) { +@@ -92,7 +91,6 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id, + } + } + */ +- + } else { + // If the PArray is not a slice, it is a parent + // invalidate all its children on all devices except the target +@@ -104,7 +102,7 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id, + this->set_parray_unsafe(i, child_id, false); + } + +- if (access_mode != AccessMode::REMOVED) { ++ if (access_mode != AccessMode::FREED) { + this->set_parray_unsafe(dev_id, child_id, false); + } + } +@@ -217,4 +215,4 @@ size_t PArrayTracker::check_log(const DevID_t dev_id, + // std::cout << "Checked parray removal: " << to_move << std::endl; + + return to_move; +-} +\ No newline at end of file ++} +diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp +index 79eef62..6975c82 100644 +--- a/src/c/backend/phases.cpp ++++ b/src/c/backend/phases.cpp +@@ -93,7 +93,7 @@ void Mapper::map_task(InnerTask *task, DeviceRequirementList &chosen_devices) { + mapped_pool.increase(mapped_size); + + InnerPArray *parray = parray_access.first; +- parray->incr_num_active_tasks(global_dev_id); ++ parray->incr_num_referring_tasks(global_dev_id); + } + } + +@@ -132,6 +132,14 @@ void Mapper::run(SchedulerPhase *next_phase) { + this->drain_parray_buffer(); + + // TODO Fix Issue #108 ++ /** XXX(hc): revisit this ++ // Accumulate necessary memory size for PArrays being mapped. ++ std::vector accum_necessary_memory; ++ // Eviction manager does not evict CPU instances; In the future, ++ // this also might be evicted to other devices. But to simplify indexing, ++ // allocate as the number of total devices. ++ accum_necessary_memory.resize(this->device_manager->get_num_devices(DeviceType::All)); ++ ***/ + while (has_task && num_task_mapping_attempt < 20) { + + this->drain_parray_buffer(); +@@ -277,6 +285,9 @@ bool MemoryReserver::check_data_resources(InnerTask *task) { + InnerPArray *parray = parray_access.first; + AccessMode access_mode = parray_access.second; + ++ // Register this PArray to eviction manager's table ++ this->scheduler->grab_parray_reference(parray, local_device_idx); ++ + // If the PArray is not an input, then we don't need to check size + // Note(@dialecticDolt): + // There is literally no such thing as an out type in our syntax why do we +@@ -305,6 +316,10 @@ bool MemoryReserver::check_data_resources(InnerTask *task) { + + status = status && device_status; + if (!status) { ++ // If a device has not enough memory, activate eviction manager ++ this->scheduler->set_memory_size_to_evict( ++ size_on_device * 2, local_device_idx); ++ this->scheduler->break_for_eviction = true; + break; + } + } +diff --git a/src/c/backend/policy.cpp b/src/c/backend/policy.cpp +index 1cf1c76..89a85fb 100644 +--- a/src/c/backend/policy.cpp ++++ b/src/c/backend/policy.cpp +@@ -12,6 +12,8 @@ bool LocalityLoadBalancingMappingPolicy::calc_score_devplacement( + // std::cout << "[Locality-aware- and Load-balancing mapping policy]\n"; + + // Check device resource availability. ++ // Note that this counter is tracking 'launched' or 'reserved' resource ++ // status. + if (!device.check_resource_availability(dev_placement_req.get())) { + // std::cout << "Device resource failure!" << std::endl; + return false; +diff --git a/src/c/backend/scheduler.cpp b/src/c/backend/scheduler.cpp +index a6fadfc..95152aa 100644 +--- a/src/c/backend/scheduler.cpp ++++ b/src/c/backend/scheduler.cpp +@@ -126,8 +126,18 @@ template class WorkerPool; + + // Scheduler Implementation + +-InnerScheduler::InnerScheduler(DeviceManager *device_manager) +- : device_manager_(device_manager) { ++InnerScheduler::InnerScheduler(LRUGlobalEvictionManager* memory_manager, ++ DeviceManager *device_manager) ++ : device_manager_(device_manager), mm_(memory_manager) { ++ ++ // For now, it does not evict PArrays on CPU memory. ++ this->memory_size_to_evict.resize( ++ device_manager->template get_num_devices()); ++ ++ // A dummy task count is used to keep the scheduler alive. ++ // NOTE: At least one task must be added to the scheduler by the main thread, ++ // otherwise the runtime will finish immediately ++ // this->increase_num_active_tasks(); + + this->workers.set_num_workers(1); + +@@ -157,31 +167,70 @@ void InnerScheduler::set_stop_callback(stopfunc_t stop_callback) { + this->stop_callback = stop_callback; + } + ++bool InnerScheduler::get_should_run() { ++ return this->should_run.load(); ++} ++ ++bool InnerScheduler::get_all_pyparrays_clear_flag() { ++ return this->clear_all_pyparrays.load(); ++} ++ ++void InnerScheduler::set_memory_size_to_evict( ++ size_t size, DevID_t dev_id) { ++ this->memory_size_to_evict[dev_id] = size; ++} ++ ++size_t InnerScheduler::get_memory_size_to_evict(DevID_t dev_id) { ++ return this->memory_size_to_evict[dev_id]; ++} ++ + void InnerScheduler::run() { + NVTX_RANGE("Scheduler::run", NVTX_COLOR_RED) +- unsigned long long iteration_count = 0; ++ this->clear_all_cparrays = false; ++ this->clear_all_pyparrays = false; + while (this->should_run.load()) { ++ this->break_for_eviction = false; + auto status = this->activate(); + if (this->sleep_flag) { + std::this_thread::sleep_for(std::chrono::milliseconds(this->sleep_time)); + } ++ if (this->break_for_eviction) { ++ // Yield a control to a Python scheduler to evict PArrays since ++ // PArray coherency protocol is managed at there. ++ break; ++ } ++ if (this->clear_all_cparrays.load()) { ++ // TODO(hc): This should be more generalized and refined. ++ // Temporarily use it as experimental puprose. ++ std::cout << "Clear all C/Python parrays..\n"; ++ this->mm_->clear_all_instances(); ++ this->clear_all_pyparrays = true; ++ break; ++ } + } + } + + void InnerScheduler::stop() { + LOG_INFO(SCHEDULER, "Stopping scheduler"); + this->should_run = false; +- launch_stop_callback(this->stop_callback, this->py_scheduler); ++ // XXX(hc): To process PArray eviction on Python, ++ // Python scheduler now has an while loop that iterates until there is ++ // no more task, and it wraps C scheduler's loop. ++ // Therefore, there is no point for C++ scheduler to explicitly invoke ++ // this callback at here. Python scheduler knows when it needs to stop. ++ //launch_stop_callback(this->stop_callback, this->py_scheduler); + LOG_INFO(SCHEDULER, "Stopped scheduler"); + } + ++void InnerScheduler::invoke_all_cparrays_clear() { ++ this->clear_all_cparrays = true; ++} ++ + Scheduler::Status InnerScheduler::activate() { + // std::cout<< "Scheduler Activated" << std::endl; +- + this->mapper->run(this->memory_reserver); + this->memory_reserver->run(this->runtime_reserver); + this->runtime_reserver->run(this->launcher); +- + // LOG_TRACE(SCHEDULER, "ReadyPhase Status: {}", this->runtime_reserver); + return this->status; + } +@@ -289,6 +338,15 @@ void InnerScheduler::remove_parray(InnerPArray *parray, DevID_t global_dev_id) { + // Could also be to call DELETED or REMOVED status on do_log + } + ++void InnerScheduler::remove_parray_from_tracker( ++ parray::InnerPArray *parray, DevID_t global_dev_id) { ++ AccessMode access_mode = AccessMode::FREED; ++ this->mapper->get_parray_tracker()->do_log(global_dev_id, ++ std::make_pair(parray, access_mode)); ++ this->memory_reserver->get_parray_tracker()->do_log(global_dev_id, ++ std::make_pair(parray, access_mode)); ++} ++ + size_t InnerScheduler::get_mapped_memory(DevID_t global_dev_idx) { + Device *device = + this->device_manager_->get_device_by_global_id(global_dev_idx); +@@ -362,7 +420,12 @@ void InnerScheduler::task_cleanup_postsync(InnerWorker *worker, InnerTask *task, + for (size_t j = 0; j < parray_access_list.size(); ++j) { + auto &parray_access = parray_access_list[j]; + InnerPArray *parray = parray_access.first; +- parray->decr_num_active_tasks(dev_id); ++ parray->decr_num_referring_tasks(dev_id); ++ // Decrease this PArray's reference count. ++ // If this becomes 0, this instance will be release ++ // when the PArray coherency protocol updates it ++ // to eviction state. ++ this->release_parray_reference(parray, dev_id); + } + } + +diff --git a/src/python/parla/__init__.py b/src/python/parla/__init__.py +index ba53b56..49ca47a 100644 +--- a/src/python/parla/__init__.py ++++ b/src/python/parla/__init__.py +@@ -7,6 +7,7 @@ from .cython import core + from .cython import device_manager + from .cython import device + from .cython import variants ++from .cython import mm + from .common.spawn import spawn + from .common import parray + +@@ -22,6 +23,7 @@ TaskSpace = tasks.TaskSpace + Tasks = tasks.TaskCollection + + DeviceManager = device_manager.PyDeviceManager ++PyMM = mm.PyMM + + Stream = device.Stream + create_env = tasks.create_env +@@ -59,6 +61,7 @@ class Parla: + self.sig = sig_type + self.handle_interrupt = True + self._device_manager = DeviceManager(dev_config_file) ++ self._memory_manager = PyMM(self._device_manager) + + if logfile is None: + logfile = os.environ.get("PARLA_LOGFILE", None) +@@ -77,7 +80,9 @@ class Parla: + if hasattr(self, "_sched"): + raise ValueError( + "Do not use the same Parla object more than once.") +- self._sched = self.scheduler_class(self._device_manager, **self.kwds) ++ self._sched = self.scheduler_class(self._memory_manager, ++ self._device_manager, ++ **self.kwds) + + self.interuppted = False + self.released = False +diff --git a/src/python/parla/common/parray/@ b/src/python/parla/common/parray/@ +new file mode 100644 +index 0000000..af841d1 +--- /dev/null ++++ b/src/python/parla/common/parray/@ +@@ -0,0 +1,541 @@ ++from __future__ import annotations ++from typing import TYPE_CHECKING, Union, List, Dict, Tuple, Any ++ ++import numpy ++import ctypes ++ ++#TODO: Fix this to be more stable and less of a hack. ++try: ++ import cupy ++except (ImportError, AttributeError): ++ import numpy as cupy ++ ++from .coherence import CPU_INDEX ++ ++if TYPE_CHECKING: # False at runtime ++ import cupy ++ ndarray = Union[numpy.ndarray, cupy.ndarray] ++ SlicesType = Union[slice, int, tuple] ++ IndicesMapType = List[Union[Dict[int, int], tuple]] ++ from parla.cython.cyparray_state import CyPArrayState ++ ++class MultiDeviceBuffer: ++ """Underlying Buffer of PArray. ++ ++ It holds per device array copy and also index mapping. ++ """ ++ ++ _buffer: Dict[int, ndarray | List[ndarray] | None] ++ shape: tuple ++ _indices_map: Dict[int, List[IndicesMapType] | None] ++ _cyparray_state: CyPArrayState ++ ++ def __init__(self, num_gpu: int, cyparray_state: CyPArrayState): ++ # per device buffer ++ # key: device_id ++ # val: single (complete) ndarray or list of (sub) ndarray ++ self._buffer = {n: None for n in range(num_gpu)} # add gpu id ++ self._buffer[CPU_INDEX] = None # add cpu id ++ ++ # per device indices mapping ++ # key: device_id ++ # val: list of {global_index: local_index} and tuple(begin, end, stop), and the tuple is a represent of slice(begin, end, stop) ++ self._indices_map = {n: None for n in range(num_gpu)} ++ self._indices_map[CPU_INDEX] = None ++ ++ # the shape of the complete array ++ self.shape = () ++ ++ self._cyparray_state = cyparray_state ++ ++ def nbytes_at(self, device_id:int) -> int: ++ """ ++ Return the buffer size at `device_id` ++ """ ++ buffer = self._buffer[device_id] ++ if buffer is None: ++ return 0 ++ elif isinstance(buffer, list): # subarray at this device buffer ++ # size is the sum ++ nbytes = 0 ++ for subarray in buffer: ++ nbytes += subarray.nbytes ++ return nbytes ++ else: # complete array ++ return buffer.nbytes ++ ++ def set_complete_array(self, array: ndarray) -> int: ++ """ ++ Add array into the buffer (based on array's device). ++ ++ Args: ++ array: :class:`cupy.ndarray` or :class:`numpy.array` object ++ ++ Return: ++ a location (device_id) of the array ++ """ ++ # get the array's location ++ if isinstance(array, numpy.ndarray): ++ location = CPU_INDEX ++ else: ++ location = int(array.device) ++ ++ self._buffer[location] = array ++ self.shape = array.shape ++ self._cyparray_state.set_exist_on_device(location, True) ++ return location ++ ++ def set(self, device_id: int, array: ndarray, is_complete: bool = True, overwrite: bool = False) -> None: ++ """ ++ Set copy at a device, also clean up existing `indices_map` if necessary ++ ++ Args: ++ device_id: gpu device_id or CPU_INDEX ++ array: :class:`cupy.ndarray` or :class:`numpy.array` object ++ is_complete: True if `array` is a complete copy, otherwise `array` is a subarray ++ overwrite: True if need to clean other subarray copy inside the device before assign the new array ++ """ ++ if is_complete: ++ self._indices_map[device_id] = None ++ self._buffer[device_id] = array ++ else: ++ if not isinstance(self._buffer[device_id], List) or overwrite: ++ self._indices_map[device_id] = None ++ self._buffer[device_id] = [array] ++ else: ++ self._buffer[device_id].append(array) ++ self._cyparray_state.set_exist_on_device(device_id, True) ++ ++ def get(self, device_id: int) -> ndarray | List[ndarray] | None: ++ """ ++ Return the copy at a device ++ ++ Args: ++ device_id: gpu device_id or CPU_INDEX ++ ++ Return ++ :class:`cupy.ndarray` or :class:`numpy.array` object ++ """ ++ return self._buffer[device_id] ++ ++ def get_global_slices(self, device_id:int, subarray_index:int) -> SlicesType | None: ++ """ ++ Return global slices of one copy at the device. ++ ++ If the copy is complete, return None ++ """ ++ if self._indices_map[device_id] is None: ++ return None ++ else: ++ slices = [] ++ for device_indices in self._indices_map[device_id][subarray_index]: ++ if isinstance(device_indices, dict): ++ index = list(device_indices.keys()) ++ if len(index) == 1: ++ slices.append(index[0]) ++ else: ++ slices.append(index) ++ else: ++ slices.append(slice(*device_indices)) ++ ++ return tuple(slices) ++ ++ @staticmethod ++ def _map_int_with_int_map(n: int, int_map: Dict[int, int]) -> int | None: ++ """ ++ Find the mapping of `n` in `int_map` ++ ++ if `n` not in `int_map`, return None ++ ++ example: ++ n: 2 ++ int_map: {1:0, 2:1} ++ return: 1 ++ """ ++ return None if n not in int_map else int_map[n] ++ ++ @staticmethod ++ def _map_int_with_slice(n: int, target_slice: tuple) -> int | None: ++ """ ++ Find the mapping of `n` in a `target_slice` (find index of `n` in `target_slice`) ++ `target_slice` is a tuple(begin, end, step) ++ ++ if `n` not in `target_slice`, return None ++ ++ example: ++ n: 2 ++ target_slice: (2, 4, 1) ++ return: 0 ++ """ ++ # TODO: assume slice is simple (no neg value) ++ begin, end, step = target_slice ++ step = 1 if step is None else step ++ ++ # bound checking ++ if n < begin or n >= end: ++ return None ++ if (n - begin) % step != 0: ++ return None ++ ++ return (n - begin) // step ++ ++ @staticmethod ++ def _map_slice_with_slice(input_slice: tuple, target_slice: tuple) -> tuple | None: ++ """ ++ Find the mapping of `input_slice` in a `target_slice` ++ `input_slice` and `target_slice` is a tuple(begin, end, step) ++ ++ if `input_slice` not a subset of `target_slice`, return None ++ ++ example: ++ input_slice: (2, 10, 4) ++ target_slice: (0, 10, 2) ++ return: (1, 5, 2) ++ """ ++ # TODO: assume slice is simple (no neg value) ++ target_begin, target_end, target_step = target_slice ++ target_step = 1 if target_step is None else target_step ++ ++ input_begin, input_end, input_step = input_slice ++ input_step = 1 if input_step is None else input_step ++ ++ mapped_begin = MultiDeviceBuffer._map_int_with_slice( ++ input_begin, target_slice) ++ ++ # get the last possible element in range of `input_slice` ++ # TODO: what if last_element < input_begin ? ++ last_element = input_end - input_step + (input_end - input_begin) % input_step ++ mapped_end = MultiDeviceBuffer._map_int_with_slice(last_element, target_slice) ++ ++ if mapped_begin is None or mapped_end is None: ++ return None ++ ++ # adjust step ++ if input_step % target_step != 0: ++ return None ++ mapped_step = input_step // target_step ++ ++ return mapped_begin, mapped_end + 1, mapped_step # tuple ++ ++ def map_local_slices(self, device_id: int, global_slices: SlicesType) -> (int, SlicesType): ++ """ ++ Map a given global slices to local slices wrt buffer at the device. ++ ++ Raise error if `global_slices` out of range ++ ++ Return subarray_index: the index of subarray in the list of `_buffer[device_id]` ++ local_slices: the local slices which maps to the `global_slices` ++ Note: this method assume a indices mapping exists for this device ++ """ ++ # indexing into the whole array, index of out bound ++ not_tuple = False ++ if not isinstance(global_slices, tuple): # if not a tuple, make it a tuple ++ global_slices = tuple([global_slices]) ++ not_tuple = True ++ ++ local_slices = [] ++ ++ if len(self.shape) < len(global_slices): ++ raise IndexError(f"index out of range, index:{global_slices}") ++ ++ final_subarray_index = 0 ++ ++ for subarray_index in range(len(self._indices_map[device_id])): # for each subarray at this device ++ indices_map = self._indices_map[device_id][subarray_index] ++ ++ for d in range(len(global_slices)): ++ size = self.shape[d] # number of entries at this axis ++ global_index = global_slices[d] ++ index_map = None if d >= len(indices_map) else indices_map[d] ++ ++ if index_map is None: # None means 1:1 map to all elements at this axis ++ local_index = global_index ++ elif isinstance(index_map, dict) and len(index_map) == 1: ++ # special case, this axis was indexed by a int, so ++ # dimension was reduced by 1, ++ # need to ignore this axis, just check index match or not ++ if list(index_map.keys())[0] == global_index: # false if type or value doesn't match ++ continue ++ else: ++ local_index = None ++ elif isinstance(index_map, tuple): ++ if isinstance(global_index, int): # int vs slice ++ local_index = MultiDeviceBuffer._map_int_with_slice(global_index, index_map) ++ elif isinstance(global_index, list): # List[int] vs slice ++ local_index = [MultiDeviceBuffer._map_int_with_slice(i, index_map) for i in global_index] ++ ++ # any index out of bound? ++ if None in local_index: ++ local_index = None ++ elif isinstance(global_index, slice): # slice vs slice ++ # slice to tuple ++ slice_tuple = global_index.indices(size) ++ local_tuple = MultiDeviceBuffer._map_slice_with_slice(slice_tuple, index_map) ++ if local_tuple is None: ++ local_index = None ++ else: ++ local_index = slice(*local_tuple) ++ else: ++ raise IndexError(f"Unsupported slices type: {type(global_index)}") ++ else: # Map is int or list ++ if isinstance(global_index, int): # int vs int/list ++ local_index = self._map_int_with_int_map(global_index, index_map) ++ elif isinstance(global_index, list): # list vs int/list ++ local_index = [self._map_int_with_int_map(i, index_map) for i in global_index] ++ ++ if None in local_index: ++ local_index = None ++ elif isinstance(global_index, slice): # slice vs int/list ++ # slice to tuple ++ slice_tuple = global_index.indices(size) ++ local_index = [self._map_int_with_int_map(i, index_map) for i in range(*slice_tuple)] ++ ++ if None in local_index: ++ local_index = None ++ else: ++ raise IndexError(f"Unsupported slices type {type(global_index)}") ++ ++ # if None, it means index out of range at this axis ++ if local_index is None: ++ # check next copy ++ local_slices = None ++ break ++ ++ local_slices.append(local_index) ++ ++ if local_slices is None: # result is not found for this subarray ++ if subarray_index == len(self._indices_map[device_id]) - 1: # this is the last subarray ++ local_slices = None # non slices is found ++ else: # check next subarray ++ local_slices = [] # clear intermidate result ++ else: ++ final_subarray_index = subarray_index ++ break ++ ++ if local_slices is None: ++ raise IndexError(f"index out of range, index:{global_slices}") ++ elif not_tuple: ++ if len(local_slices) == 0: # only be possible when special case int vs int exists and all axis are ignored ++ return final_subarray_index, slice(None, None, None) ++ else: ++ return final_subarray_index, local_slices[0] ++ else: ++ return final_subarray_index, tuple(local_slices) ++ ++ def set_slices_mapping(self, device_id: int, global_slices: SlicesType): ++ """ ++ set a global slices to local slices mapping wrt buffer at the device. ++ ++ Raise error if `global_slices` is higher dim than shape ++ Note: this call doesn't check slice is within range, if it is not in range ++ exception will be trigger later when trying to index into the copy ++ """ ++ if not isinstance(global_slices, tuple): # if not a tuple, make it a tuple ++ global_slices = tuple([global_slices]) ++ ++ if len(self.shape) < len(global_slices): ++ raise IndexError(f"index out of range, index:{global_slices}") ++ ++ slices_map_list = [] ++ for d in range(len(global_slices)): ++ size = self.shape[d] # number of entries at this axis ++ global_slice = global_slices[d] ++ ++ if isinstance(global_slice, int): # a single integer ++ slice_map = {global_slice: 0} ++ elif isinstance(global_slice, list): # a list of integer ++ slice_map = {global_slice[i]: i for i in range(len(global_slice))} ++ elif isinstance(global_slice, slice): # slice ++ # save slice as a tuple ++ # None in slice will be instantiated by concrete values ++ slice_map = global_slice.indices(size) ++ else: ++ raise IndexError(f"Unsupported slices type {type(global_slice)}") ++ slices_map_list.append(slice_map) ++ ++ if self._indices_map[device_id] is None: ++ self._indices_map[device_id] = [slices_map_list] ++ else: ++ self._indices_map[device_id].append(slices_map_list) ++ ++ def get_by_global_slices(self, device_id: int, global_slices: SlicesType): ++ """ ++ Indexing/Slicing the buffer by `global_slices`. ++ ++ `global_slices` will be first converted into local slices ++ ++ Args: ++ device_id: gpu device_id or CPU_INDEX ++ global_slices: slice/ints/tuple/list, use the same format as advance indexing of numpy ++ ++ Return ++ :class:`cupy.ndarray` or :class:`numpy.array` object or `None` if there is no copy at that device ++ """ ++ # check if a copy exists at this device ++ # ++ # This is needed for the usage of @spawn() annotation ++ # sometimes device has no copy but still need a parray slices view object in advance ++ # Example: ++ # A = parray(numpy_array) ++ # @spawn(inout=[A], placement=gpu(0)) ++ # def task1(): ++ # ... a task move A to GPU. free CPU copy ... ++ # @spawn(inout=[A[0]], placement=gpu(1)) ++ # def task2(): ++ # error: cpu's copy is freed -> `inout=[A[0]]` trigger an exception, ++ # since it try a slice A at CPU (spawn() itself is happened at outer CPU task) ++ if self._buffer[device_id] is None: ++ return None ++ ++ # check if there is a mapping ++ if self._indices_map[device_id] is None: ++ return self._buffer[device_id].__getitem__(global_slices) ++ else: ++ # map global slices to local slices ++ subarray_index, local_slices = self.map_local_slices(device_id, global_slices) ++ return self._buffer[device_id][subarray_index].__getitem__(local_slices) ++ ++ def set_by_global_slices(self, device_id: int, global_slices: SlicesType, value: ndarray | Any): ++ """ ++ Indexing/Slicing the buffer by `global_slices` and set value. ++ ++ `global_slices` will be first converted into local slices ++ ++ Args: ++ device_id: gpu device_id or CPU_INDEX ++ global_slices: slice/ints/tuple/list, use the same format as advance indexing of numpy ++ value: the data to set ++ ++ Return ++ :class:`cupy.ndarray` or :class:`numpy.array` object ++ """ ++ # check if there is a mapping ++ if self._indices_map[device_id] is None: ++ self._buffer[device_id].__setitem__(global_slices, value) ++ else: ++ # map global slices to local slices ++ subarray_index, local_slices = self.map_local_slices(device_id, global_slices) ++ self._buffer[device_id][subarray_index].__setitem__(local_slices, value) ++ ++ ++ def _move_data(self, copy_func, dst: int, src: int, subarray_index: int, dst_slices: SlicesType, src_slices: SlicesType, dst_is_current_device:bool = True): ++ """ ++ Helper function for copy_data_between_device ++ """ ++ if dst_is_current_device: ++ if dst_slices is None and src_slices is None: # Complete to Complete ++ self._buffer[dst] = copy_func(self._buffer[src]) ++ elif dst_slices is None and src_slices is not None: # Incomplete to Complete ++ self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index]) ++ elif dst_slices is not None and src_slices is None: # Complete to incomplete ++ if self._buffer[dst] is None: ++ self._buffer[dst] = [] ++ self._buffer[dst].append(copy_func(self._buffer[src][dst_slices])) ++ else: # incomplete to incomplete ++ raise ValueError("Copy from subarray to subarray is unsupported") ++ else: ++ with cupy.cuda.Device(dst): # switch device ++ if dst_slices is None and src_slices is None: # Complete to Complete ++ self._buffer[dst] = copy_func(self._buffer[src]) ++ elif dst_slices is None and src_slices is not None: # Incomplete to Complete ++ self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index]) ++ elif dst_slices is not None and src_slices is None: # Complete to incomplete ++ if self._buffer[dst] is None: ++ self._buffer[dst] = [] ++ self._buffer[dst].append(copy_func(self._buffer[src][dst_slices])) ++ else: # incomplete to incomplete ++ raise ValueError("Copy from subarray to subarray is unsupported") ++ ++ def copy_data_between_device(self, dst: int, src: int, dst_is_current_device: bool = True) -> None: ++ """ ++ Copy data from src to dst. ++ ++ dst is current device if `dst_is_current_device` is True ++ """ ++ # a function to copy data between GPU devices async ++ def copy_from_device_async(src): ++ dst_data = cupy.empty_like(src) ++ dst_data.data.copy_from_device_async(src.data, src.nbytes) ++ return dst_data ++ ++ if self._indices_map[src] is None: ++ src_slices_list = [None] ++ else: ++ src_slices_list = [self.get_global_slices(src, i) for i in range(len(self._indices_map[src]))] ++ ++ # TRICK: if there are multiple subarray in this device, always pick the last one ++ # this is because load of data always comes together with create indices mapping ++ # so the indices mapping will put at the end of self._indices_map ++ dst_slices = self.get_global_slices(dst, -1) ++ ++ for subarray_index in range(len(src_slices_list)): ++ src_slices = src_slices_list[subarray_index] ++ if src == CPU_INDEX: # copy from CPU to GPU ++ self._move_data(cupy.asarray, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device) ++ elif dst != CPU_INDEX: # copy from GPU to GPU ++ self._move_data(copy_from_device_async, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device) ++ else: # copy from GPU to CPU ++ self._move_data(cupy.asnumpy, dst, src, subarray_index, dst_slices, src_slices) # dst_is_current_device is no need if dst is CPU ++ self._cyparray_state.set_exist_on_device(dst, True) ++ ++ def get_slices_hash(self, global_slices: SlicesType) -> int: ++ """ ++ Get hash value of a slices of complete array. ++ ++ This could be done by replaing list and slice to tuple ++ """ ++ # little chance to have collision, but what if it happened? ++ hash_value = 17 # use a none zero hash value, so hash(0) != 0 ++ prime = 31 ++ if not isinstance(global_slices, tuple): ++ if isinstance(global_slices, list): ++ # Built-int hash() method might return negtive value. ++ # c_size_t is to ensure it is not negative ++ hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(global_slices))).value ++ elif isinstance(global_slices, slice): ++ hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices.indices(self.shape[0]))).value ++ else: ++ hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices)).value ++ else: ++ if len(self.shape) < len(global_slices): ++ raise IndexError(f"index out of range, index:{global_slices}") ++ ++ for d in range(len(global_slices)): ++ index = global_slices[d] ++ if isinstance(index, list): ++ hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(index))).value ++ elif isinstance(index, slice): ++ hash_value = hash_value * prime + ctypes.c_size_t(hash(index.indices(self.shape[d]))).value ++ else: ++ hash_value = hash_value * prime + ctypes.c_size_t(hash(index)).value ++ ++ return hash_value ++ ++ def __str__(self): ++ return str(self._buffer) ++ ++ def __contains__(self, device_id): ++ """ ++ Return True if there is a copy in this device ++ """ ++ return device_id in self._buffer and self._buffer[device_id] is not None ++ ++ def clear(self, device_id) -> None: ++ """ ++ Clear data in device_id ++ """ ++ import gc ++ mempool = cupy.get_default_memory_pool() ++ pinned_mempool = cupy.get_default_pinned_memory_pool() ++ print("inside evict():", mempool.used_bytes(), flush=True) ++ pritn("total:", mempool.total_bytes()) ++ print("device id:", device_id, flush=True) ++#print("inside pinned evict():", pinned_mempool.used_bytes(), flush=True) ++ print("evicting array size:", self._buffer[device_id].nbytes) ++ print("type:", type(self._buffer[device_id])) ++ del self._buffer[device_id] ++ print("After inside evict():", mempool.used_bytes(), flush=True) ++#print("After inside pinned evict():", pinned_mempool.used_bytes(), flush=True) ++ self._indices_map[device_id] = None ++ self._buffer[device_id] = None ++ self._cyparray_state.set_exist_on_device(device_id, False) +diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py +index e25396c..8d89075 100644 +--- a/src/python/parla/common/parray/core.py ++++ b/src/python/parla/common/parray/core.py +@@ -120,6 +120,8 @@ class PArray: + # Note(@dialecticDolt):It should be valid to create PArrays outside of a scheduler context!! + # FIXME + ++ # Register this PArray to tracker and make a link between ++ # C PArray instance. + scheduler = get_scheduler() + if scheduler is None: + raise NotImplementedError( +@@ -129,6 +131,7 @@ class PArray: + num_devices = len(scheduler.device_manager.get_all_devices()) + self._cy_parray = CyPArray( + self, self.ID, self.parent_ID, self.parent, self._cyparray_state, num_devices) ++ # record the size in Cython PArray + self._cy_parray.set_size(self.subarray_nbytes) + + target_dev_id = - \ +@@ -301,7 +304,7 @@ class PArray: + f"Parent_ID: {self.parent_ID if self.ID != self.parent_ID else None}, " + f"Slice: {self._slices[0] if self.ID != self.parent_ID else None}, " + f"Bytes: {self.subarray_nbytes}, " +- f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}") ++ f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}", flush=True) + for device_id, state in self._coherence._local_states.items(): + if device_id == CPU_INDEX: + device_name = "CPU" +@@ -311,13 +314,13 @@ class PArray: + + if isinstance(state, dict): + print( +- f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy: # states of slices is unordered wrt the below slices") ++ f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy: # states of slices is unordered wrt the below slices", flush=True) + for slice, slice_id in zip(self._array._indices_map[device_id], range(len(self._array._indices_map[device_id]))): + print( +- f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}") ++ f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}", flush=True) + else: +- print(f"state: {state_str_map[state]}") +- print("---End of Overview") ++ print(f"state: {state_str_map[state]}", flush=True) ++ print("---End of Overview", flush=True) + + # slicing/indexing + +@@ -395,7 +398,7 @@ class PArray: + + with self._coherence_cv[device_id]: + operations = self._coherence.evict(device_id, keep_one_copy) +- if operations[0].inst == MemoryOperation.ERROR: ++ if len(operations) != 0 and operations[0].inst == MemoryOperation.ERROR: + return False # cannot perform the eviction + self._process_operations(operations) + +@@ -479,11 +482,21 @@ class PArray: + # f"Evicting {self.name} from {op.src}, size: {to_free} bytes", flush=True) + + scheduler = get_scheduler() +- if (to_free > 0) and (scheduler is not None): +- # This frees the memory on the device in the mapped and reserved pools +- scheduler.device_manager.free_memory(op.src, to_free) +- # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well. +- ++ if scheduler is not None: ++ if to_free > 0: ++ # This frees the memory on the device in the mapped and reserved pools ++ scheduler.device_manager.free_memory(op.src, to_free) ++ # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well. ++ src_global_dev_id = ++ scheduler.device_manager.parrayid_to_globalid(op.src) ++ if self._cy_parray.get_num_referring_tasks(src_global_dev_id) == 0: ++ # If none of active tasks refers this PArray, ++ # remove this PArray on the src device from ++ # the PArray tracker's table. ++ scheduler.remove_parray_from_tracker( ++ self._cy_parray, src_global_dev_id) ++ # decrement the reference counter, relying on GC to free the memor ++ self._array.clear(op.src) + elif op.inst == MemoryOperation.ERROR: + raise RuntimeError( + "PArray gets an error from coherence protocol") +@@ -906,5 +919,11 @@ class PArray: + def get_parray_parentid_from_cpp(self): + return self._cy_parray.get_parray_parentid() + +- def get_num_active_tasks(self, global_dev_id): +- return self._cy_parray.get_num_active_tasks(global_dev_id) ++ def get_num_referring_tasks(self, global_dev_id): ++ return self._cy_parray.get_num_referring_tasks(global_dev_id) ++ ++ def __del__(self): ++ # Users can explicitly call `del` over a Python PArray. ++ # In this case, detroy its array instance. ++ # TODO(hc): This code is not tested yet ++ self._array = None +diff --git a/src/python/parla/common/parray/memory.py b/src/python/parla/common/parray/memory.py +index 05cef02..89bf74c 100644 +--- a/src/python/parla/common/parray/memory.py ++++ b/src/python/parla/common/parray/memory.py +@@ -573,3 +573,9 @@ class MultiDeviceBuffer: + self._buffer[device_id] = None + self._cyparray_state.set_exist_on_device(device_id, False) + return to_free ++ ++ def __del__(self): ++ for i in range(0, len(self._buffer)): ++ self._cyparray_state.set_exist_on_device(i, False) ++ self._indices_map = None ++ self._buffer = None +diff --git a/src/python/parla/cython/CMakeLists.txt b/src/python/parla/cython/CMakeLists.txt +index 431fc58..7378df9 100644 +--- a/src/python/parla/cython/CMakeLists.txt ++++ b/src/python/parla/cython/CMakeLists.txt +@@ -14,6 +14,7 @@ add_cython_target(tasks tasks.pyx CXX PY3) + add_cython_target(scheduler scheduler.pyx CXX PY3) + add_cython_target(device device.pyx CXX PY3) + add_cython_target(device_manager device_manager.pyx CXX PY3) ++add_cython_target(mm mm.pyx CXX PY3) + add_cython_target(cyparray_state cyparray_state.pyx CXX PY3) + add_cython_target(cyparray cyparray.pyx CXX PY3) + add_cython_target(variants variants.pyx CXX PY3) +@@ -23,6 +24,7 @@ add_library(tasks MODULE ${tasks}) + add_library(scheduler MODULE ${scheduler}) + add_library(device MODULE ${device}) + add_library(device_manager MODULE ${device_manager}) ++add_library(mm MODULE ${mm}) + add_library(cyparray_state MODULE ${cyparray_state}) + add_library(cyparray MODULE ${cyparray}) + add_library(variants MODULE ${variants}) +@@ -37,6 +39,7 @@ install(TARGETS tasks LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/p + install(TARGETS scheduler LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) + install(TARGETS device LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) + install(TARGETS device_manager LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) ++install(TARGETS mm LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) + install(TARGETS cyparray_state LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) + install(TARGETS cyparray LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) + install(TARGETS variants LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) +@@ -89,6 +92,14 @@ target_include_directories(device_manager PUBLIC ${NumPy_INCLUDE_DIRS}) + target_include_directories(device_manager PUBLIC ${PYTHON_INCLUDE_DIRS}) + target_include_directories(device_manager PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + ++target_link_libraries(mm ${PYTHON_LIBRARIES}) ++target_link_libraries(mm backend) ++target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend) ++target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend/include) ++target_include_directories(mm PUBLIC ${NumPy_INCLUDE_DIRS}) ++target_include_directories(mm PUBLIC ${PYTHON_INCLUDE_DIRS}) ++target_include_directories(mm PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) ++ + target_link_libraries(cyparray_state ${PYTHON_LIBRARIES}) + target_link_libraries(cyparray_state backend) + target_include_directories(cyparray_state PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend) +@@ -119,6 +130,7 @@ python_extension_module(tasks) + python_extension_module(scheduler) + python_extension_module(device) + python_extension_module(device_manager) ++python_extension_module(mm) + python_extension_module(cyparray_state) + python_extension_module(cyparray) + python_extension_module(variants) +diff --git a/src/python/parla/cython/core.pxd b/src/python/parla/cython/core.pxd +index 9643d9f..76a3f75 100644 +--- a/src/python/parla/cython/core.pxd ++++ b/src/python/parla/cython/core.pxd +@@ -4,6 +4,7 @@ cimport cython + from parla.cython.device_manager cimport DeviceManager + from parla.cython.device cimport Device, CyDevice + from parla.cython.cyparray cimport InnerPArray ++from parla.cython.mm cimport LRUGlobalEvictionManager + + from libc.stdint cimport uint32_t, uint64_t, int64_t + from libcpp cimport bool +@@ -22,6 +23,7 @@ cdef extern from "include/gpu_utility.hpp" nogil: + void gpu_busy_sleep(const int device, const unsigned long cycles, + uintptr_t stream_ptr) + ++ + cdef extern from "include/runtime.hpp" nogil: + ctypedef void (*launchfunc_t)(void* py_scheduler, void* py_task, void* py_worker) + ctypedef void (*stopfunc_t)(void*) +@@ -130,15 +132,19 @@ cdef extern from "include/runtime.hpp" nogil: + + bool should_run + +- InnerScheduler(DeviceManager* cpp_device_manager) ++ InnerScheduler(LRUGlobalEvictionManager* cpp_memory_manager, DeviceManager* cpp_device_manager) + + void set_num_workers(int num_workers) + void set_py_scheduler(void* py_scheduler) + void set_stop_callback(stopfunc_t func) + ++ bool get_should_run() ++ + void run() except + + void stop() + ++ long long int get_memory_size_to_evict(int dev_id) except + ++ + void activate_wrapper() + + void spawn_task(InnerTask* task) +@@ -149,7 +155,6 @@ cdef extern from "include/runtime.hpp" nogil: + void task_cleanup_presync(InnerWorker* worker, InnerTask* task, int state) except + + void task_cleanup_postsync(InnerWorker* worker, InnerTask* task, int state) except + + +- int get_num_active_tasks() + void increase_num_active_tasks() + void decrease_num_active_tasks() + +@@ -168,7 +173,10 @@ cdef extern from "include/runtime.hpp" nogil: + void spawn_wait() + + void create_parray(InnerPArray* parray, int parray_dev_id) ++ void remove_parray_from_tracker(InnerPArray* parray, int dev_id) + ++ void invoke_all_cparrays_clear() ++ bool get_all_pyparrays_clear_flag() + + + cdef extern from "include/profiling.hpp" nogil: +diff --git a/src/python/parla/cython/core.pyx b/src/python/parla/cython/core.pyx +index d2ac9f9..68891c5 100644 +--- a/src/python/parla/cython/core.pyx ++++ b/src/python/parla/cython/core.pyx +@@ -12,6 +12,7 @@ from parla.common.globals import AccessMode + from parla.cython.device cimport Device + from parla.cython.cyparray cimport CyPArray + from parla.cython.device_manager cimport CyDeviceManager, DeviceManager ++from parla.cython.mm cimport CyMM + import threading + from enum import IntEnum, auto + from parla.common.globals import cupy +@@ -585,14 +586,15 @@ cdef class PyInnerWorker: + cdef class PyInnerScheduler: + cdef InnerScheduler* inner_scheduler + +- def __cinit__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): ++ def __cinit__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): + cdef InnerScheduler* _inner_scheduler + cdef DeviceManager* _cpp_device_manager = cy_device_manager.get_cpp_device_manager() ++ cdef LRUGlobalEvictionManager* _cpp_memory_manager = cy_memory_manager.get_cpp_memory_manager() + +- _inner_scheduler = new InnerScheduler(_cpp_device_manager) ++ _inner_scheduler = new InnerScheduler(_cpp_memory_manager, _cpp_device_manager) + self.inner_scheduler = _inner_scheduler + +- def __init__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): ++ def __init__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): + cdef InnerScheduler* _inner_scheduler + _inner_scheduler = self.inner_scheduler + +@@ -610,6 +612,14 @@ cdef class PyInnerScheduler: + def __dealloc__(self): + del self.inner_scheduler + ++ cpdef get_should_run(self): ++ """ ++ This function checks whether there are remaining tasks ++ in C scheduler queues. ++ """ ++ cdef InnerScheduler* c_self = self.inner_scheduler ++ return c_self.get_should_run() ++ + cpdef run(self): + cdef InnerScheduler* c_self = self.inner_scheduler + with nogil: +@@ -661,10 +671,6 @@ cdef class PyInnerScheduler: + with nogil: + c_self.task_cleanup_postsync(c_worker, c_task, state) + +- cpdef get_num_active_tasks(self): +- cdef InnerScheduler* c_self = self.inner_scheduler +- return c_self.get_num_active_tasks() +- + cpdef increase_num_active_tasks(self): + cdef InnerScheduler* c_self = self.inner_scheduler + c_self.increase_num_active_tasks() +@@ -720,6 +726,18 @@ cdef class PyInnerScheduler: + cdef InnerScheduler* c_self = self.inner_scheduler + return c_self.get_reserved_memory(global_dev_id) + ++ cpdef get_memory_size_to_evict(self, int global_dev_id): ++ cdef InnerScheduler* c_self = self.inner_scheduler ++ return c_self.get_memory_size_to_evict(global_dev_id) ++ ++ cpdef invoke_all_cparrays_clear(self): ++ cdef InnerScheduler* c_self = self.inner_scheduler ++ c_self.invoke_all_cparrays_clear() ++ ++ cpdef get_all_pyparrays_clear_flag(self): ++ cdef InnerScheduler* c_self = self.inner_scheduler ++ return c_self.get_all_pyparrays_clear_flag() ++ + + class Resources: + +diff --git a/src/python/parla/cython/cyparray.pxd b/src/python/parla/cython/cyparray.pxd +index 73223bc..06e7422 100644 +--- a/src/python/parla/cython/cyparray.pxd ++++ b/src/python/parla/cython/cyparray.pxd +@@ -12,12 +12,12 @@ cdef extern from "include/parray.hpp" namespace "parray": + cdef cppclass InnerPArray: + InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *i, uint32_t) except + + void set_size(uint64_t) +- uint64_t get_num_active_tasks(uint32_t global_dev_id) except + ++ uint64_t get_num_referring_tasks(uint32_t global_dev_id) except + + const uint64_t get_parent_id() except + + + cdef class CyPArray: + # Hold a C++ instance which we're wrapping + cdef InnerPArray* cpp_parray + cdef InnerPArray* get_cpp_parray(self) +- cpdef get_num_active_tasks(self, int global_dev_id) ++ cpdef get_num_referring_tasks(self, int global_dev_id) + cpdef get_parray_parentid(self) +diff --git a/src/python/parla/cython/cyparray.pyx b/src/python/parla/cython/cyparray.pyx +index 19e9fee..ad13d6a 100644 +--- a/src/python/parla/cython/cyparray.pyx ++++ b/src/python/parla/cython/cyparray.pyx +@@ -37,8 +37,8 @@ cdef class CyPArray: + cdef InnerPArray* get_cpp_parray(self): + return self.cpp_parray + +- cpdef get_num_active_tasks(self, int global_dev_id): +- return self.cpp_parray.get_num_active_tasks(global_dev_id) ++ cpdef get_num_referring_tasks(self, int global_dev_id): ++ return self.cpp_parray.get_num_referring_tasks(global_dev_id) + + cpdef get_parray_parentid(self): + return self.cpp_parray.get_parent_id() +diff --git a/src/python/parla/cython/mm.pxd b/src/python/parla/cython/mm.pxd +new file mode 100644 +index 0000000..8a8670a +--- /dev/null ++++ b/src/python/parla/cython/mm.pxd +@@ -0,0 +1,13 @@ ++from parla.cython.device_manager cimport DeviceManager ++ ++cdef extern from "include/memory_manager.hpp" nogil: ++ cdef cppclass LRUGlobalEvictionManager: ++ LRUGlobalEvictionManager(DeviceManager *) ++ unsigned long long size(unsigned int device_id) ++ void *remove_and_return_head_from_zrlist(unsigned int device_id) ++ ++cdef class CyMM: ++ cdef LRUGlobalEvictionManager* _inner_mm ++ cpdef size(self, int dev_id) ++ cpdef remove_and_return_head_from_zrlist(self, int dev_id) ++ cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self) +diff --git a/src/python/parla/cython/mm.pyx b/src/python/parla/cython/mm.pyx +new file mode 100644 +index 0000000..268e7cf +--- /dev/null ++++ b/src/python/parla/cython/mm.pyx +@@ -0,0 +1,62 @@ ++ ++from parla.cython import device_manager ++ ++from parla.cython.core cimport LRUGlobalEvictionManager ++from parla.cython cimport device_manager ++#from parla.cython.core import LRUGlobalEvictionManager ++ ++class PyMM: ++ def __init__(self, dm: device_manager.PyDeviceManager): ++ self._device_manager = device_manager ++ self._cy_mm = CyMM(dm.get_cy_device_manager()) ++ ++ def size(self, dev_id: int): ++ return self._cy_mm.size(dev_id) ++ ++ def remove_and_return_head_from_zrlist(self, dev_id: int): ++ return self._cy_mm.remove_and_return_head_from_zrlist(dev_id) ++ ++ def get_cy_memory_manager(self): ++ return self._cy_mm ++ ++ def print_memory_stats(self, device_id, label: str): ++ import psutil ++ import os ++ print(f"[{label}] Memory tracking", flush=True) ++ try: ++ import cupy ++ mempool = cupy.get_default_memory_pool() ++ pinned_mempool = cupy.get_default_pinned_memory_pool() ++ print(( ++ f"\t GPU{device_id} {label} CuPy used bytes: {mempool.used_bytes()} \n" ++ f"\t GPU{device_id} {label} Free bytes: {mempool.free_bytes()} \n" ++ f"\t GPU{device_id} {label} Total bytes: {mempool.total_bytes()} \n"), flush=True) ++ except ImportError: ++ print("MM tracker only supports CuPy memory status checking.", flush=True) ++ ++ ++cdef class CyMM: ++ ++ def __cinit__(self, device_manager.CyDeviceManager cy_dm): ++ self._inner_mm = new LRUGlobalEvictionManager(cy_dm.get_cpp_device_manager()) ++ ++ def __dealloc__(self): ++ del self._inner_mm ++ ++ cpdef size(self, int dev_id): ++ cdef LRUGlobalEvictionManager* c_self = self._inner_mm ++ return c_self.size(dev_id) ++ ++ cpdef remove_and_return_head_from_zrlist(self, int dev_id): ++ cdef LRUGlobalEvictionManager* c_self = self._inner_mm ++ cdef void* py_parray = c_self.remove_and_return_head_from_zrlist(dev_id) ++ if py_parray == NULL: ++ # TODO(hc): This path is actually not used. ++ # It would be great if we can check if this python object is valid ++ # at here; it can simplify our current mechanism a lot. ++ return None ++ else: ++ return py_parray ++ ++ cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self): ++ return self._inner_mm +diff --git a/src/python/parla/cython/scheduler.pyx b/src/python/parla/cython/scheduler.pyx +index 3390240..db2caae 100644 +--- a/src/python/parla/cython/scheduler.pyx ++++ b/src/python/parla/cython/scheduler.pyx +@@ -21,6 +21,7 @@ from parla.cython import tasks + cimport core + from parla.cython import core + from parla.cython.cyparray import CyPArray ++from parla.cython.mm import PyMM + + from parla.common.globals import _Locals as Locals + from parla.common.globals import USE_PYTHON_RUNAHEAD, _global_data_tasks, PREINIT_THREADS +@@ -231,11 +232,11 @@ class WorkerThread(ControllableThread, SchedulerContext): + #print("Setting environment for task", active_task, flush=True) + active_task.environment = device_context + +- + #Writes all 'default' streams and event pointers to c++ task + #This allows their synchronization without the GIL and faster iteration over them + #(only saves initial runtime ones, TODO(wlr): save any user added events or streams after body returns) + device_context.write_to_task(active_task) ++ + #print("Wrote enviornment to task", active_task, flush=True) + + #handle event wait in python +@@ -294,6 +295,7 @@ class WorkerThread(ControllableThread, SchedulerContext): + elif isinstance(final_state, tasks.TaskRunahead): + core.binlog_2("Worker", "Runahead task: ", active_task.inner_task, " on worker: ", self.inner_worker) + ++ #TODO(wlr): Add better exception handling + #print("Cleaning up Task", active_task, flush=True) + + if USE_PYTHON_RUNAHEAD: +@@ -314,13 +316,13 @@ class WorkerThread(ControllableThread, SchedulerContext): + if isinstance(final_state, tasks.TaskRunahead): + final_state = tasks.TaskCompleted(final_state.return_value) + active_task.cleanup() +- + core.binlog_2("Worker", "Completed task: ", active_task.inner_task, " on worker: ", self.inner_worker) + + # print("Finished Task", active_task, flush=True) + active_task.state = final_state + self.task = None + ++ self.task = None + nvtx.pop_range(domain="Python Runtime") + elif self._should_run: + raise WorkerThreadException("%r Worker: Woke without a task", self.index) +@@ -350,7 +352,7 @@ class WorkerThread(ControllableThread, SchedulerContext): + + class Scheduler(ControllableThread, SchedulerContext): + +- def __init__(self, device_manager, n_threads=6, period=0.001): ++ def __init__(self, memory_manager, device_manager, n_threads=6, period=0.001): + super().__init__() + + self.start_monitor = threading.Condition(threading.Lock()) +@@ -364,9 +366,28 @@ class Scheduler(ControllableThread, SchedulerContext): + #TODO: Handle resources better + resources = 1.0 + ++ self.memory_manager = memory_manager + self.device_manager = device_manager ++ cy_memory_manager = self.memory_manager.get_cy_memory_manager() + cy_device_manager = self.device_manager.get_cy_device_manager() +- self.inner_scheduler = PyInnerScheduler(cy_device_manager, n_threads, resources, self) ++ self.inner_scheduler = PyInnerScheduler(cy_memory_manager, ++ cy_device_manager, ++ n_threads, ++ resources, self) ++ # This holds PArray references. ++ # Through this, it makes a scheduler control a PArray's life cycle. ++ # For example, this holds the last reference of a PArray and so, ++ # scheduler (or memory manager) can have the control of thats ++ # deallocation. ++ # TODO(hc): However, for now, we only support reset this dictionary ++ # not deallocating a single PArray. ++ # For now, instead we deallocate PArray instance by ++ # removing the internal array reference through evict(). ++ # In the future, we will have better design for this. ++ self.active_parrays = {} ++ # Worker threads and a scheduler both can access the active_parrays ++ # and so we need a lock to guard that. ++ self.active_parrays_monitor = threading.Condition(threading.Lock()) + + self.worker_threads = [WorkerThread(self, i) for i in range(n_threads)] + +@@ -382,6 +403,58 @@ class Scheduler(ControllableThread, SchedulerContext): + def scheduler(self): + return self + ++ def append_active_parray(self, parray: PArray): ++ """ Append a PArray reference. ++ ++ :param parray: PArray to be appended ++ """ ++ with self.active_parrays_monitor: ++ self.active_parrays[parray.ID] = parray ++ ++ def remove_active_parray(self, parray: PArray): ++ """ Remove a PArray reference. ++ ++ :param parray: PArray to be removed ++ """ ++ with self.active_parrays_monitor: ++ self.active_parrays[parray.ID] = None ++ ++ def clear_active_parrays(self): ++ """ Clear all references from active_parrays dictionary. ++ """ ++ with self.active_parrays_monitor: ++ import cupy ++ for k in range(0, 4): ++ with cupy.cuda.Device(k): ++ mempool = cupy.get_default_memory_pool() ++ print(f"\t {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True) ++ # TODO(hc): This is unstable hack. ++ # This loop is necessary since a worker thread ++ # can proceed with the next task graph ++ # (in case of benchmark/python/benchmark.py) ++ # and generate and allocate new data while a scheduler ++ # who is the caller of this function holds its reference. ++ # It seems like there is a concurrency issue; for example, ++ # removing active_parrays' reference is not immediately ++ # caught by a gc. The safest way that I am using is to evict ++ # each of PArrays at here. ++ # But still it is unstable way and I will replace this soon. ++ for k, v in self.active_parrays.items(): ++ for dev in self.device_manager.get_all_devices(): ++ global_dev_id = dev.get_global_id() ++ parray_dev_id = self.device_manager.globalid_to_parrayid(global_dev_id) ++ v.evict(parray_dev_id) ++ # TODO(hc): I am not sure why the above loop is necessary.. ++ # But otherwise, it doesnt deallocate cupy arrays. ++ # TODO(hc): This is not considering concurrent execution between ++ # a thread and a scheduler.. ++ # The main thread can allocate new data while or before the scheduler ++ # deallocates the old PArrays through this. ++ for k in range(0, 4): ++ with cupy.cuda.Device(k): ++ mempool = cupy.get_default_memory_pool() ++ print(f"\t After {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True) ++ + def get_device_reqs_from_placement(self, placement, vcus, memory): + return self.device_manager.get_device_reqs_from_placement(placement, vcus, memory) + +@@ -414,10 +487,73 @@ class Scheduler(ControllableThread, SchedulerContext): + pass + #print("Runtime Stopped", flush=True) + ++ def parray_eviction(self): ++ py_mm = self.memory_manager ++ print("Eviction policy is activated") ++ for cuda_device in self.device_manager.get_devices(DeviceType.CUDA): ++ global_id = cuda_device.get_global_id() ++ parray_id = self.device_manager.globalid_to_parrayid(global_id) ++ # Get target memory size to evict from this device ++ memory_size_to_evict = \ ++ self.inner_scheduler.get_memory_size_to_evict(global_id) ++ # Get the number of PArray candidates that are allowed to be evicted ++ # from Python eviction manager. ++ num_evictable_parray = py_mm.size(global_id) ++ # TODO(hc): remove this. this is for test. ++ import cupy ++ for i in range(0, num_evictable_parray): ++ try: ++ # Get a PArray from a memory manager to evict. ++ evictable_parray = \ ++ py_mm.remove_and_return_head_from_zrlist(global_id) ++ if evictable_parray is not None: ++ # TODO(hc): remove this. this is for test. ++ for k in range(0, 4): ++ with cupy.cuda.Device(k): ++ mempool = cupy.get_default_memory_pool() ++ print(f"\t OK? {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) ++ ++ evictable_parray.evict(parray_id) ++ ++ # TODO(hc): remove this. this is for test. ++ for k in range(0, 4): ++ with cupy.cuda.Device(k): ++ mempool = cupy.get_default_memory_pool() ++ print(f"\t OK {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) ++ ++ # Repeat eviction until it gets enough memory. ++ memory_size_to_evict -= \ ++ evictable_parray.nbytes_at(parray_id) ++ if memory_size_to_evict <= 0: ++ break ++ except Exception as e: ++ print("Failed to find parray evictable", flush=True) ++ return ++ + def run(self): +- #print("Scheduler: Running", flush=True) +- self.inner_scheduler.run() +- #print("Scheduler: Stopped Loop", flush=True) ++ with self: ++ while True: ++ print("Scheduler: Running", flush=True) ++ self.inner_scheduler.run() ++ if self.inner_scheduler.get_all_pyparrays_clear_flag(): ++ # All the references of the PArrays held by ++ # a Python scheduler should be destroyed ++ # AFTER C++ scheduler (or memory manager) clears ++ # all PArray nodes from the list in the eviction manager. ++ # TODO(hc): rename the function to is_cparrays_cleared. ++ # TODO(hc): this might be done by a worker thread ++ # who will allocate PArrays. ++ # Otherwise, there should be thread concurrent ++ # over memory allocation/deallocation. ++ self.clear_active_parrays() ++ else: ++ should_run = self.inner_scheduler.get_should_run() ++ if should_run == False: ++ break ++ # This case is executed if PArray eviction ++ # mechanism was invoked by C++ scheduler. ++ self.parray_eviction() ++ self.stop_callback() + + def stop(self): + #print("Scheduler: Stopping (Called from Python)", flush=True) +@@ -437,7 +573,6 @@ class Scheduler(ControllableThread, SchedulerContext): + def spawn_task(self, task): + #print("Scheduler: Spawning Task", task, flush=True) + self.inner_scheduler.spawn_task(task.inner_task) +- + + def assign_task(self, task, worker): + task.state = tasks.TaskRunning(task.func, task.args, task.dependencies) +@@ -449,7 +584,6 @@ class Scheduler(ControllableThread, SchedulerContext): + def spawn_wait(self): + self.inner_scheduler.spawn_wait() + +- + def create_parray(self, cy_parray: CyPArray, parray_dev_id: int): + """ + Reserve PArray instances that are created through +@@ -514,6 +648,20 @@ class Scheduler(ControllableThread, SchedulerContext): + return self.inner_scheduler.get_reserved_parray_state( \ + global_dev_id, parray_parent_id) + ++ def remove_parray_from_tracker(\ ++ self, cy_parray: CyPArray, did: int): ++ """ ++ Remove the evicted PArray instance on device `global_dev_id` ++ from the PArray tracker's table ++ ++ :param cy_parray: Cython PArray instance to be removed ++ :param did: global logical device id where the PArray is evicted ++ """ ++ self.inner_scheduler.remove_parray_from_tracker(cy_parray, did) ++ ++ def invoke_all_cparrays_clear(self): ++ self.inner_scheduler.invoke_all_cparrays_clear() ++ + + def _task_callback(task, body): + """ +diff --git a/src/python/parla/cython/tasks.pyx b/src/python/parla/cython/tasks.pyx +index 98bd759..bd71e44 100644 +--- a/src/python/parla/cython/tasks.pyx ++++ b/src/python/parla/cython/tasks.pyx +@@ -21,6 +21,8 @@ from parla.common.globals import AccessMode, Storage + from parla.cython.cyparray import CyPArray + from parla.common.parray.core import PArray + from parla.common.globals import SynchronizationType as SyncType ++from parla.common.globals import _global_data_tasks ++ + + PyDevice = device.PyDevice + PyCUDADevice = device.PyCUDADevice +@@ -554,18 +556,25 @@ class Task: + cy_parray = in_parray.cy_parray + self.inner_task.add_parray(cy_parray, + AccessMode.IN, in_parray_devid) ++ # Add a PArray reference to a dictionary in a scheduler ++ # to pass its lifecycle. ++ self.scheduler.append_active_parray(in_parray) + for out_parray_tpl in dataflow.output: + out_parray = out_parray_tpl[0] + out_parray_devid = out_parray_tpl[1] + cy_parray = out_parray.cy_parray + self.inner_task.add_parray(cy_parray, + AccessMode.OUT, out_parray_devid) ++ self.scheduler.append_active_parray(out_parray) + for inout_parray_tpl in dataflow.inout: + inout_parray = inout_parray_tpl[0] + inout_parray_devid = inout_parray_tpl[1] + cy_parray = inout_parray.cy_parray + self.inner_task.add_parray(cy_parray, + AccessMode.INOUT, inout_parray_devid) ++ # TODO(hc): Maybe we can pass dataflow to reduce ++ # lock conflicts. ++ self.scheduler.append_active_parray(inout_parray) + + def notify_dependents_wrapper(self): + """! +@@ -713,6 +722,7 @@ class DataMovementTask(Task): + idx=0, state=TaskCreated(), scheduler=None, name=None): + super().__init__(taskspace, idx, state, scheduler, name) + self.parray = parray ++ + self.access_mode = access_mode + self.assigned_devices = assigned_devices + +@@ -748,7 +758,6 @@ class DataMovementTask(Task): + self.parray._auto_move(device_manager.get_parray_id(global_device_id), + write_flag) + """ +-#self.parray._auto_move(device_manager.get_parray_id(self.dev_id), write_flag) + target_dev = self.assigned_devices[0] + global_id = target_dev.get_global_id() + parray_id = device_manager.globalid_to_parrayid(global_id) +@@ -758,7 +767,8 @@ class DataMovementTask(Task): + return TaskRunahead(0) + + def cleanup(self): +- pass ++ _global_data_tasks[id(self)] = None ++ self.parray = None + + ###### + # Task Environment +@@ -1828,9 +1838,3 @@ class BackendTaskSpace(TaskSpace): + + def wait(self): + self.inner_space.wait() +- +- +- +- +- +- +diff --git a/src/python/parla/utility/execute.py b/src/python/parla/utility/execute.py +index 36825aa..eadcaaf 100644 +--- a/src/python/parla/utility/execute.py ++++ b/src/python/parla/utility/execute.py +@@ -261,15 +261,16 @@ def synthetic_kernel_gpu(total_time: int, gil_fraction: Union[Fraction, float], + return None + + +-def create_task_no_data(task, taskspaces, config, data_list=None): ++def create_task_no_data(task, taskspaces, config, ts_postfix=None, data_list=None): + + try: + # Task ID + task_idx = task.task_id.task_idx +- taskspace = taskspaces[task.task_id.taskspace] ++ ts_postfix_str = "" if ts_postfix is None else ts_postfix ++ taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str] + + # Dependency Info +- dependencies = [taskspaces[dep.taskspace][dep.task_idx] ++ dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx] + for dep in task.task_dependencies] + + # Valid Placement Set +@@ -326,15 +327,16 @@ def create_task_no_data(task, taskspaces, config, data_list=None): + return + + +-def create_task_eager_data(task, taskspaces, config=None, data_list=None): ++def create_task_eager_data(task, taskspaces, config=None, ts_postfix=None, data_list=None): + + try: + # Task ID + task_idx = task.task_id.task_idx +- taskspace = taskspaces[task.task_id.taskspace] ++ ts_postfix_str = "" if ts_postfix is None else ts_postfix ++ taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str] + + # Dependency Info +- dependencies = [taskspaces[dep.taskspace][dep.task_idx] ++ dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx] + for dep in task.task_dependencies] + + # Valid Placement Set +@@ -397,14 +399,15 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None): + if config.gil_fraction is not None: + gil_fraction = config.gil_fraction + +- # print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True) + """ ++ print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True) + print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction, + " placement:", placement_set) +- """ +- + # TODO(hc): Add data checking. +- @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT) ++ print("Memory:", IN[0][0].nbytes) ++ """ ++ # TODO(hc): remove memory operand ++ @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT, memory=IN[0][0].nbytes) + async def task_func(): + if config.verbose: + print(f"+{task.task_id} Running", flush=True) +@@ -421,15 +424,16 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None): + return + + +-def create_task_lazy_data(task, taskspaces, config=None, data_list=None): ++def create_task_lazy_data(task, taskspaces, config=None, ts_postfix=None, data_list=None): + + try: + # Task ID + task_idx = task.task_id.task_idx +- taskspace = taskspaces[task.task_id.taskspace] ++ ts_postfix_str = "" if ts_postfix is None else ts_postfix ++ taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str] + + # Dependency Info +- dependencies = [taskspaces[dep.taskspace][dep.task_idx] ++ dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx] + for dep in task.task_dependencies] + + # Valid Placement Set +@@ -483,8 +487,11 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None): + + if config.gil_fraction is not None: + gil_fraction = config.gil_fraction ++ """ + print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction, + " placement:", placement_set) ++ ++ """ + + @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set]) + async def task_func(): +@@ -520,7 +527,7 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None): + return + + +-def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, data_list=None): ++def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, ts_postfix=None, data_list=None): + + spawn_start_t = time.perf_counter() + +@@ -530,32 +537,36 @@ def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConf + if run_config.movement_type == MovementType.NO_MOVEMENT: + # print("No data movement") + create_task_no_data(details, taskspaces, +- config=run_config, data_list=data_list) ++ config=run_config, ++ ts_postfix=ts_postfix, ++ data_list=data_list) + elif run_config.movement_type == MovementType.EAGER_MOVEMENT: + # print("Eager data movement") + create_task_eager_data(details, taskspaces, +- config=run_config, data_list=data_list) ++ config=run_config, ++ ts_postfix=ts_postfix, ++ data_list=data_list) + elif run_config.movement_type == MovementType.LAZY_MOVEMENT: + # print("Lazy data movement") + create_task_lazy_data(details, taskspaces, +- config=run_config, data_list=data_list) +- ++ config=run_config, ++ ts_postfix=ts_postfix, ++ data_list=data_list) + spawn_end_t = time.perf_counter() + + return taskspaces + + + def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]): +- + @spawn(vcus=0) + async def main_task(): + + graph_times = [] ++ # Generate data once for multiple iterations. ++ data_list = generate_data( ++ data_config, run_config.data_scale, run_config.movement_type) + + for i in range(run_config.inner_iterations): +- data_list = generate_data( +- data_config, run_config.data_scale, run_config.movement_type) +- + # Initialize task spaces + taskspaces = {} + +@@ -570,10 +581,76 @@ def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo + + for taskspace in taskspaces.values(): + await taskspace ++ graph_end_t = time.perf_counter() ++ ++ #worker_thread = get_scheduler_context() ++ #worker_thread.scheduler.invoke_all_cparrays_clear() ++ ++ graph_elapsed = graph_end_t - graph_start_t ++ graph_times.append(graph_elapsed) ++ ++ graph_times = np.asarray(graph_times) ++ graph_t = TimeSample(np.mean(graph_times), np.median(graph_times), np.std( ++ graph_times), np.min(graph_times), np.max(graph_times), len(graph_times)) ++ ++ timing.append(graph_t) ++ ++ ++def execute_graph_memory2(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]): ++ """ ++ This function creates data for each iteration and intentionally decreases ++ its reference count. ++ Data generated in the previous iteration becomes unnecessary and a ++ scheduler's eviction manager evicts those data to CPU. ++ This function requires the following consideration; ++ data should be less than CPU memory. So, the number of iterations or ++ PArray size should not be high. ++ TODO(hc): it might be a separate test. ++ """ ++ @spawn(vcus=0) ++ async def main_task(): ++ ++ graph_times = [] ++ data_list = [] ++ max_memory = 0 ++ for i in range(0, run_config.inner_iterations): ++ import cupy ++ # Initialize task spaces ++ taskspaces = {} ++ ++ data_list.append(generate_data(data_config, run_config.data_scale, run_config.movement_type)) ++ ts_postfix = "-"+str(i) ++ # Create a task space with the postfix of the current iteration. ++ for task, details in tasks.items(): ++ space_name = details.task_id.taskspace ++ space_name += ts_postfix ++ if space_name not in taskspaces: ++ taskspaces[space_name] = TaskSpace(space_name) + ++ for k in range(0, 4): ++ with cupy.cuda.Device(k): ++ mempool = cupy.get_default_memory_pool() ++ print(f"\t Right Before {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") ++ ++ graph_start_t = time.perf_counter() ++ ++ execute_tasks(taskspaces, tasks, run_config, ts_postfix, data_list=data_list[i]) ++ ++ for taskspace in taskspaces.values(): ++ await taskspace ++ taskspace = None + graph_end_t = time.perf_counter() + ++ #worker_thread = get_scheduler_context() ++ #worker_thread.scheduler.invoke_all_cparrays_clear() ++ ++ for k in range(0, 4): ++ with cupy.cuda.Device(k): ++ mempool = cupy.get_default_memory_pool() ++ print(f"\t Right After {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") ++ + graph_elapsed = graph_end_t - graph_start_t ++ print("Iteration:", i, ", execution time:", graph_elapsed, flush=True) + graph_times.append(graph_elapsed) + + graph_times = np.asarray(graph_times) +@@ -597,7 +674,7 @@ def run(tasks: Dict[TaskID, TaskInfo], data_config: Dict[int, DataInfo] = None, + + with Parla(logfile=run_config.logfile): + internal_start_t = time.perf_counter() +- execute_graph(data_config, tasks, run_config, timing) ++ execute_graph_memory2(data_config, tasks, run_config, timing) + internal_end_t = time.perf_counter() + + outer_end_t = time.perf_counter() +@@ -797,12 +874,14 @@ class GraphContext(object): + return self + + def run(self, run_config: RunConfig, max_time: int = 100): +- ++ return run(self.graph, self.data_config, run_config) ++ """ + @timeout(max_time) + def run_with_timeout(): + return run(self.graph, self.data_config, run_config) + + return run_with_timeout() ++ """ + + def __exit__(self, type, value, traceback): + self.diro.__exit__(type, value, traceback) diff --git a/src/python/parla/cython/tasks.pyx b/src/python/parla/cython/tasks.pyx index 80be4101..1d8ee49b 100644 --- a/src/python/parla/cython/tasks.pyx +++ b/src/python/parla/cython/tasks.pyx @@ -721,6 +721,7 @@ class DataMovementTask(Task): global_id = target_dev.get_global_id() parray_id = device_manager.globalid_to_parrayid(global_id) + print(self.name, " moves parray:", parray_id, " on device:", target_dev) self.parray._auto_move(parray_id, write_flag) return TaskRunahead(0) @@ -1787,9 +1788,3 @@ class BackendTaskSpace(TaskSpace): def wait(self): self.inner_space.wait() - - - - - - From 22d80669c1f690542f702974a5cad2dd104a4761 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Sun, 14 Jan 2024 16:13:50 -0600 Subject: [PATCH 2/5] addcomments --- src/c/backend/include/runtime.hpp | 9 ++++++++- src/c/backend/phases.cpp | 22 ++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp index 88e06018..d4ab7214 100644 --- a/src/c/backend/include/runtime.hpp +++ b/src/c/backend/include/runtime.hpp @@ -277,7 +277,7 @@ class InnerTask { /* Task Assigned Device Set*/ std::vector assigned_devices; - /*Resource Requirements for each assigned device*/ + /* Resource Requirements for each assigned device*/ std::unordered_map device_constraints; /* Task is data movement task */ @@ -293,6 +293,13 @@ class InnerTask { std::vector>> parray_list; + /* A list of dependency tasks of a parray for this task's dependent tasks. + To be specific, a task sets dependencies of a parray for dependent tasks. + If this task's access permission to a parray includes write, it sets + itself as the dependency of the parray. + If this task's access permission to the parray is read-only, it pulls + this list of the dependencies to this map. + */ std::unordered_map> parray_dependencies_map; InnerTask(); diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp index 2713e21f..ac48bca8 100644 --- a/src/c/backend/phases.cpp +++ b/src/c/backend/phases.cpp @@ -239,7 +239,8 @@ void MemoryReserver::create_datamove_tasks(InnerTask *task) { - +// TODO(hc): need to think about better naming before it is merged. +// first, need peer-review on this. void MemoryReserver::create_datamove_tasks2(InnerTask *task) { // Get a list of the parrays the current task holds. const std::vector>> @@ -258,28 +259,25 @@ void MemoryReserver::create_datamove_tasks2(InnerTask *task) { task_base_name + ".dm." + std::to_string(i), 0, parray, access_mode, i); uint64_t parray_parent_id = parray->get_parent_parray()->id; - // Find dependency intersection between compute and data movement tasks. - - // TODO(hc): This is not the complete implementation. - // We will use a concurrent map for parray's - // task list as an optimization. - + // Get dependencies std::vector compute_task_dependencies = task->get_dependencies(); std::vector data_task_dependencies; for (size_t k = 0; k < compute_task_dependencies.size(); ++k) { InnerTask *parray_dependency = static_cast(compute_task_dependencies[k]); + // Get dependencies of a parray having `parray_parent_id` that have + // registered to the traversed dependency task std::vector& dep_parray_dependencies = parray_dependency->get_parray_dependencies(parray_parent_id); - std::cout << parray_dependency->name << " is being traversed\n"; + //std::cout << parray_dependency->name << " is being traversed\n"; for (size_t t = 0; t < dep_parray_dependencies.size(); ++t) { data_task_dependencies.push_back(parray_dependency); // If the current processing parray's access mode is READ ONLY, // add this dependency as a dependency for this parray. - std::cout << "access mode:" << int(access_mode) << "\n"; + //std::cout << "access mode:" << int(access_mode) << "\n"; if (access_mode == AccessMode::IN) { - std::cout << "IN parray is added:" << parray_parent_id << "\n"; + //std::cout << "IN parray is added:" << parray_parent_id << "\n"; task->get_parray_dependencies(parray_parent_id).push_back(parray_dependency); } } @@ -287,9 +285,9 @@ void MemoryReserver::create_datamove_tasks2(InnerTask *task) { // If the current processing parray's access mode is not READ ONLY, // add itself as a dependency for this parray. - std::cout << task->name << " is being traversed access id :" << int(access_mode) << "\n"; + //std::cout << task->name << " is being traversed access id :" << int(access_mode) << "\n"; if (access_mode != AccessMode::IN) { - std::cout << "IN/OUT OUT parray is added:" << parray_parent_id << "\n"; + //std::cout << "IN/OUT OUT parray is added:" << parray_parent_id << "\n"; task->get_parray_dependencies(parray_parent_id).push_back(task); } From e0b22a77ee4c705ca85cbb627ef9f203416282a0 Mon Sep 17 00:00:00 2001 From: nicelhc13 Date: Sun, 14 Jan 2024 22:18:16 +0000 Subject: [PATCH 3/5] style: apply ruff format --- src/python/parla/common/dataflow.py | 2 +- src/python/parla/common/parray/coherence.py | 16 +++++++--------- src/python/parla/common/parray/core.py | 9 +++------ src/python/parla/utility/graphs.py | 6 +++--- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/python/parla/common/dataflow.py b/src/python/parla/common/dataflow.py index 8bdfcf74..254d0b9f 100644 --- a/src/python/parla/common/dataflow.py +++ b/src/python/parla/common/dataflow.py @@ -103,7 +103,7 @@ def process_crosspys( _out.append((parray, i)) else: raise TypeError( - f"Invalid Type: {type(element)}. Dataflow should be PArray, CrossPyArray, or Tuple[PArray, int]" + f"Invalid Type: {type(element)}. Dataflow should be PArray, CrossPyArray, or Tuple[PArray, int]" ) return _out diff --git a/src/python/parla/common/parray/coherence.py b/src/python/parla/common/parray/coherence.py index 62e103a8..ba2c546d 100644 --- a/src/python/parla/common/parray/coherence.py +++ b/src/python/parla/common/parray/coherence.py @@ -27,14 +27,14 @@ class MemoryOperation: # Flag SWITCH_DEVICE_FLAG = ( - 101 - ) # if the flag is set, it means dst is not the current device + 101 # if the flag is set, it means dst is not the current device + ) LOAD_SUBARRAY = ( - 102 - ) # if the flag is set, it means a subarray of src should be loaded + 102 # if the flag is set, it means a subarray of src should be loaded + ) ENSURE_IS_COMPLETE = ( - 103 - ) # if the flag is set, check data will also check if the data is complete + 103 # if the flag is set, check data will also check if the data is complete + ) def __init__(self, inst: int = NOOP, dst: int = -1, src: int = -1, flag: int = []): self.inst = inst @@ -125,9 +125,7 @@ def __init__(self, init_owner: int, num_gpu: int, cyparray_state: CyPArrayState) self._is_complete[CPU_INDEX] = None self._local_states[init_owner] = self.MODIFIED # initial state is MODIFIED - self.owner = ( - init_owner - ) # the device that has the complete copy (take the role of main memory) + self.owner = init_owner # the device that has the complete copy (take the role of main memory) self._versions[init_owner] = 0 # the first version is 0 self._is_complete[init_owner] = True # the copy is complete self._latest_version = 0 # the latest version in the system diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py index c4211086..c7dd56ed 100644 --- a/src/python/parla/common/parray/core.py +++ b/src/python/parla/common/parray/core.py @@ -170,13 +170,11 @@ def get_array(self, device_idx: Optional[int] = None) -> ndarray: """ if device_idx is None: - device_idx = self._current_device_index + device_idx = self._current_device_index if self._slices: # so this is a sub-parray object # index into origin array by saved slices - ret = self._array.get_by_global_slices( - device_idx, self._slices[0] - ) + ret = self._array.get_by_global_slices(device_idx, self._slices[0]) for s in self._slices[1:]: ret = ret[s] return ret @@ -214,10 +212,9 @@ def _current_device_index(self) -> int: # to avoid import gpu context, which is slow to setup. return device.device.id # device.device should be a cupy.cuda.Device object - # Public API: - def get(self, device: Optional[PyDevice] = None) -> 'np.ndarray' | 'cp.ndarray': + def get(self, device: Optional[PyDevice] = None) -> "np.ndarray" | "cp.ndarray": if device is None: return self.array else: diff --git a/src/python/parla/utility/graphs.py b/src/python/parla/utility/graphs.py index 072f44f3..d9852dea 100644 --- a/src/python/parla/utility/graphs.py +++ b/src/python/parla/utility/graphs.py @@ -251,8 +251,8 @@ class RunConfig: """ outer_iterations: int = ( - 1 - ) # Number of times to launch the Parla runtime and execute the task graph + 1 # Number of times to launch the Parla runtime and execute the task graph + ) # Number of times to execute the task graph within the same Parla runtime inner_iterations: int = 1 inner_sync: bool = False # Whether to synchronize after each kernel launch @@ -504,7 +504,7 @@ def get_task_properties(line: str): def parse_blog( - filename: str = "parla.blog" + filename: str = "parla.blog", ) -> Tuple[Dict[TaskID, TaskTime], Dict[TaskID, List[TaskID]]]: try: result = subprocess.run( From f7da831c12f4d544653c9535f374d3669533ae72 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Sun, 14 Jan 2024 16:20:08 -0600 Subject: [PATCH 4/5] remove unncessary file --- src/dev.patch | 2498 ------------------------------------------------- 1 file changed, 2498 deletions(-) delete mode 100644 src/dev.patch diff --git a/src/dev.patch b/src/dev.patch deleted file mode 100644 index 1789036a..00000000 --- a/src/dev.patch +++ /dev/null @@ -1,2498 +0,0 @@ -diff --git a/src/c/backend/include/memory_manager.hpp b/src/c/backend/include/memory_manager.hpp -new file mode 100644 -index 0000000..b1467ec ---- /dev/null -+++ b/src/c/backend/include/memory_manager.hpp -@@ -0,0 +1,420 @@ -+#ifndef PARLA_MEMORY_MANNGER_HPP -+#define PARLA_MEMORY_MANNGER_HPP -+ -+#include "device_manager.hpp" -+#include "parray.hpp" -+ -+ -+/** -+ * @brief Node type of a PArray eviction double-linked list. -+ */ -+class PArrayNode { -+public: -+ PArrayNode(parray::InnerPArray *parr, size_t prior = 0) : -+ parray(parr), priority(prior), next(nullptr), prev(nullptr) -+ {} -+ -+ /// Pointer of a PArray instance -+ parray::InnerPArray *parray; -+ /// Priority of the node -+ /// TODO(hc): This is not used, but keep it for the future -+ size_t priority; -+ /// Pointers to the next and the previous PArrayNodes -+ PArrayNode *next; -+ PArrayNode *prev; -+}; -+ -+/** -+ * @brief Double-linked list of candidate PArrays for eviction. -+ * @details PArray eviction manager selects and evicts PArray instances -+ * in this list depending on an eviction policy. -+ * Note that an eviction manager manages this list for each device. -+ */ -+class DoubleLinkedList { -+public: -+ -+ /** -+ * @brief Print the current list. -+ */ -+ void print() { -+ PArrayNode *node = this->head_; -+ std::cout << "\n"; -+ while (node != nullptr) { -+ std::cout << node->parray->id << " -> \n"; -+ node = node->next; -+ } -+ std::cout << "\n"; -+ } -+ -+ /** -+ * @brief Append a PArray node to the list. -+ * @detail The first PArray of the list is set to both head and tail, and -+ * the last added PArray is set to tail. -+ * -+ * @param node PArray node to be appended -+ */ -+ void append(PArrayNode *node) { -+ this->mtx_.lock(); -+ if (this->list_size_ == 0) { -+ this->head_ = node; -+ this->tail_ = node; -+ } else { -+ this->tail_->next = node; -+ node->prev = this->tail_; -+ this->tail_ = node; -+ } -+ this->list_size_ += 1; -+ this->mtx_.unlock(); -+ } -+ -+ /** -+ * @brief Insert a PArray node between `node` and `node->next`. -+ * -+ * @param node existing PArray node where `new_node` is being linked -+ * @param new_node PArray node to be appended after `node` -+ */ -+ void insert_after(PArrayNode *node, PArrayNode *new_node) { -+ this->mtx_.lock(); -+ if (node->next != nullptr) { -+ node->next->prev = new_node; -+ new_node->next = node->next; -+ } else { -+ this->tail_ = new_node; -+ } -+ node->next = new_node; -+ new_node->prev = node; -+ this->mtx_.unlock(); -+ } -+ -+ /** -+ * @brief Insert a PArray node between `node` and `node->prev`. -+ * -+ * @param node existing PArray node where `new_node` is being linked -+ * @param new_node PArray node to be appended before `node` -+ */ -+ void insert_before(PArrayNode *node, PArrayNode *new_node) { -+ this->mtx_.lock(); -+ if (node->prev != nullptr) { -+ node->prev->next = new_node; -+ new_node->prev = node->prev; -+ } else { -+ this->head_ = new_node; -+ } -+ node->prev = new_node; -+ new_node->next = node; -+ this->mtx_.unlock(); -+ } -+ -+ /** -+ * @brief Remove and return the current head PArray node from a list. -+ */ -+ PArrayNode *remove_head() { -+ this->mtx_.lock(); -+ PArrayNode *old_head = this->head_; -+ if (old_head != nullptr) { -+ this->remove_unsafe(old_head); -+ } -+ this->mtx_.unlock(); -+ return old_head; -+ } -+ -+ /** -+ * @brief Remove a node and return true if it is removed false otherwise. -+ * -+ * @param node PArray node to be removed from a list -+ */ -+ bool remove(PArrayNode *node) { -+ this->mtx_.lock(); -+ bool rv = this->remove_unsafe(node); -+ this->mtx_.unlock(); -+ return rv; -+ } -+ -+ /** -+ * @brief Remove a node and return true if it is removed false otherwise. -+ * This function is not thread safe. -+ * -+ * @param node PArray node to be removed from a list -+ */ -+ bool remove_unsafe(PArrayNode *node) { -+ if (node->prev == nullptr && node->next == nullptr && -+ node != this->head_ && node != this->tail_) { -+ // If a node is not in a list, do nothing and return false. -+ return false; -+ } -+ -+ if (this->list_size_ == 1) { -+ // A node is a single node in a list. -+ this->head_ = this->tail_ = nullptr; -+ } else { -+ if (this->head_ == node) { -+ // A node is a head, and so break link of node->next->prev. -+ this->head_ = node->next; -+ node->next->prev = nullptr; -+ } else if (this->tail_ == node) { -+ // A node is a tail, and so break link of node->prev->next. -+ this->tail_ = node->prev; -+ node->prev->next = nullptr; -+ } else { -+ // A node is in the middle of a list, and so break two links. -+ node->prev->next = node->next; -+ node->next->prev = node->prev; -+ } -+ } -+ node->prev = node->next = nullptr; -+ this->list_size_ -= 1; -+ return true; -+ } -+ -+ /** -+ * @brief Return a size of a list. -+ */ -+ size_t size() { -+ this->mtx_.lock(); -+ size_t list_size = this->list_size_; -+ this->mtx_.unlock(); -+ return list_size; -+ } -+ -+ /** -+ * @brief Return the current head. -+ * This function is not thread safe. -+ */ -+ PArrayNode *get_head() { -+ return this->head_; -+ } -+ -+ /** -+ * @brief Return the current tail. -+ * This function is not thread safe. -+ */ -+ PArrayNode *get_tail() { -+ return this->tail_; -+ } -+ -+private: -+ PArrayNode *head_{nullptr}; -+ PArrayNode *tail_{nullptr}; -+ std::mutex mtx_; -+ size_t list_size_{0}; -+}; -+ -+ -+/** -+ * @brief Least-recently-used policy based eviction manager for a device. -+ * @details It holds PArrays which are not referenced by tasks which are -+ * between task mapping and termination phases. -+ */ -+class LRUDeviceEvictionManager { -+public: -+ struct PArrayMetaInfo { -+ // Points to a PArray node if it exists -+ PArrayNode *parray_node_ptr; -+ // The number of references to a PArray -+ size_t ref_count; -+ }; -+ -+ LRUDeviceEvictionManager(DevID_t dev_id) : dev_id_(dev_id) {} -+ -+ /** -+ * @brief A task refers `parray` in the device. -+ * @detail This function is called when a task being mapped -+ * refers `parray`. This increases a reference count of the PArray -+ * and removes it from a zero-referenced list if it exists. -+ * -+ * @param parray pointer to a parray to be referred by a task -+ */ -+ void grab_parray_reference(parray::InnerPArray *parray) { -+ this->mtx_.lock(); -+ uint64_t parray_id = parray->id; -+ auto found = this->parray_reference_counts_.find(parray_id); -+ if (found == this->parray_reference_counts_.end()) { -+ // Add `parray` to a zr list if it does not exist. -+ PArrayNode *parray_node = new PArrayNode(parray); -+ this->parray_reference_counts_[parray_id] = -+ PArrayMetaInfo{parray_node, 1}; -+ } else { -+ // If `parray` is already in a zr list, removes it -+ // from the list and increases its reference count. -+ found->second.ref_count++; -+ this->zr_parray_list_.remove(found->second.parray_node_ptr); -+ } -+ this->mtx_.unlock(); -+ } -+ -+ /** -+ * @brief A task is finished and releases `parray` in the device. -+ * @detail This function is called by a worker thread when a task -+ * assigned to that thread is completed. The thread releases the -+ * `parray` instance, and decreases its reference count in the device. -+ * If the reference count becomes 0, the `parray` is added to -+ * the zero-referenced list. -+ * -+ * @param parray pointer to a parray to be released by a task -+ */ -+ void release_parray_reference(parray::InnerPArray *parray) { -+ this->mtx_.lock(); -+ uint64_t parray_id = parray->id; -+ auto found = this->parray_reference_counts_.find(parray_id); -+ if (found != this->parray_reference_counts_.end()) { -+ found->second.ref_count--; -+ if (found->second.ref_count == 0) { -+ // If none of task referes to `parray`, add it to -+ // a zr list. -+ this->zr_parray_list_.append(found->second.parray_node_ptr); -+ } -+ } -+ this->mtx_.unlock(); -+ } -+ -+ -+ /** -+ * @brief Return a size of a list. -+ */ -+ size_t size() { -+ size_t zr_parray_list_size{0}; -+ this->mtx_.lock(); -+ zr_parray_list_size = zr_parray_list_.size(); -+ this->mtx_.unlock(); -+ return zr_parray_list_size; -+ } -+ -+ /** -+ * @brief Remove and return a head of the zero-referenced list. -+ * @detail This function is not thread safe since it assumes that only -+ * the scheduler thread calls into this function during eviction. -+ */ -+ PArrayNode *remove_and_return_head_from_zrlist() { -+ PArrayNode* old_head{nullptr}; -+ this->mtx_.lock(); -+ old_head = this->zr_parray_list_.remove_head(); -+ this->mtx_.unlock(); -+ return old_head; -+ } -+ -+ /** -+ * @brief This function clears all existing PArrays in the -+ * zero-referenced list. -+ * @detail This function has two purposes. -+ * First, it is used to fix unlinked Python and C++ PArray -+ * instances. It is possible that Python PArrays are destroyed -+ * due to, for example, out-of-scope. Then, C++ PArrays -+ * start to hold invalid Python PArray pointers. -+ * When a scheduler starts PArray eviction, it is possible that -+ * the C++ PArrays holding invalid Python PArrays are chosen -+ * as evictable PArrays and causes segmentation fault. -+ * This function removes those PArrays in advance to avoid -+ * this issue (But users should be aware of and take care of this scenario). -+ * The second purpose is to allow users to clear all memory -+ * related states managed by the Parla runtime. -+ */ -+ // TODO(hc): This bulk flushing is not ideal IMO. The Parla runtime -+ // should provide a function that flushes only a single PArray. -+ // I am postponing this work since we need to take care of -+ // the zero-referenced list, but I have higher priorities. -+ void clear_all_instances() { -+ this->mtx_.lock(); -+ PArrayNode* head{nullptr}; -+ do { -+ head = this->zr_parray_list_.remove_head(); -+ } while (head != nullptr); -+ this->mtx_.unlock(); -+ } -+ -+private: -+ /// This eviction manager manages PArray instances in this device -+ DevID_t dev_id_; -+ std::mutex mtx_; -+ /// Key: PArray ID, Value: Meta information including reference -+ /// count of a PArray -+ std::unordered_map parray_reference_counts_; -+ /// A list of zero-referenced PArrays. -+ DoubleLinkedList zr_parray_list_; -+}; -+ -+ -+/** -+ * @brief Least-recently-used policy based global eviction manager. -+ * @details External components access and manipulate PArray instances in any -+ * device through this manager. -+ */ -+class LRUGlobalEvictionManager { -+public: -+ LRUGlobalEvictionManager(DeviceManager *device_manager) : -+ device_manager_(device_manager) { -+ this->device_mm_.resize( -+ device_manager->template get_num_devices()); -+ for (size_t i = 0; i < this->device_mm_.size(); ++i) { -+ this->device_mm_[i] = new LRUDeviceEvictionManager(i); -+ } -+ } -+ -+ /** -+ * @brief A task refers `parray` in `dev_id` device. -+ * @detail This function is called when a task being mapped -+ * refers `parray`. This increases a reference count of the PArray -+ * and removes it from a zero-referenced list if it exists. -+ * -+ * @param parray pointer to a parray to be referred by a task -+ * @param dev_id device id of a device to access its information -+ */ -+ void grab_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) { -+ this->device_mm_[dev_id]->grab_parray_reference(parray); -+ } -+ -+ /** -+ * @brief A task is finished and releases `parray` in `dev_id` device. -+ * @detail This function is called by a worker thread when a task -+ * assigned to that completes. So, the thread also releases a -+ * `parray`, and decreases a reference count of that in the device. -+ * If the reference count becomes 0, the `parray` is added to -+ * its zero-referenced list. -+ * -+ * @param parray pointer to a parray to be released by a task -+ * @param dev_id device id of a device to access its information -+ */ -+ void release_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) { -+ this->device_mm_[dev_id]->release_parray_reference(parray); -+ } -+ -+ /** -+ * @brief Return a size of a list. -+ * -+ * @param dev_id device id of a device to access its information -+ */ -+ size_t size(DevID_t dev_id) { -+ return this->device_mm_[dev_id]->size(); -+ } -+ -+ /** -+ * @brief Remove and return a head of the zero-referenced list. -+ * @detail This function is not thread safe since it assumes that only -+ * the scheduler thread calls into this function during eviction. -+ * -+ * @param dev_id device id of a device to access its information -+ */ -+ void *remove_and_return_head_from_zrlist(DevID_t dev_id) { -+ PArrayNode *old_head = -+ this->device_mm_[dev_id]->remove_and_return_head_from_zrlist(); -+ void *py_parray{nullptr}; -+ if (old_head != nullptr) { -+ parray::InnerPArray *c_parray = old_head->parray; -+ py_parray = c_parray->get_py_parray(); -+ } -+ return py_parray; -+ } -+ -+ void clear_all_instances() { -+ for (size_t i = 0; i < device_mm_.size(); ++i) { -+ device_mm_[i]->clear_all_instances(); -+ } -+ } -+ -+private: -+ /// Device manager managing system environment -+ DeviceManager *device_manager_; -+ /// A list of LRU-based eviction manager for each device -+ std::vector device_mm_; -+}; -+ -+#endif -diff --git a/src/c/backend/include/parray.hpp b/src/c/backend/include/parray.hpp -index 68d3de6..aa6fa56 100644 ---- a/src/c/backend/include/parray.hpp -+++ b/src/c/backend/include/parray.hpp -@@ -30,8 +30,8 @@ enum AccessMode { - INOUT = 2, - /// Output of a task. - OUT = 3, -- /// Removed PArray (false everywhere). -- REMOVED = 4, -+ /// Freed PArray (false everywhere). -+ FREED = 4, - /// Deleted PArray (removed from table). - DELETED = 5, - }; -@@ -59,7 +59,7 @@ public: - /// compared to restructuring overheads), but this counter is decreased. - /// This is used to provide more accurate PArray placement information - /// to the task mapping step. -- std::vector> num_active_tasks; -+ std::vector> num_referring_tasks; - - InnerPArray() = delete; - InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *, -@@ -92,14 +92,14 @@ public: - /// Add a pointer of the task that will use this PArray to the task list - void add_task(InnerTask *task); - -- /// Increase the counter for the active tasks that use this PArray. -- void incr_num_active_tasks(DevID_t global_dev_id); -+ /// Increase the number of the tasks referring this PArray. -+ void incr_num_referring_tasks(DevID_t global_dev_id); - -- /// Decrease the counter for the active tasks that use this PArray. -- void decr_num_active_tasks(DevID_t global_dev_id); -+ /// Decrease the number of the tasks referring this PArray. -+ void decr_num_referring_tasks(DevID_t global_dev_id); - -- /// Get the number of counter for the active tasks that use this PArray. -- size_t get_num_active_tasks(DevID_t global_dev_id); -+ /// Get the number of the tasks referring this PArray. -+ size_t get_num_referring_tasks(DevID_t global_dev_id); - - // TODO(hc): I will replace this list with a concurrent map. - /// Get a reference to a list of tasks who are using this PArray -@@ -136,8 +136,8 @@ private: - // I will use this map: https://github.com/greg7mdp/parallel-hashmap - // I have used this for a while and it is good. - TaskList _task_lists; -- -+ /// Python PArray address - void *_py_parray; - }; - --} // namespace parray -\ No newline at end of file -+} // namespace parray -diff --git a/src/c/backend/include/parray_tracker.hpp b/src/c/backend/include/parray_tracker.hpp -index b49c62f..c46141b 100644 ---- a/src/c/backend/include/parray_tracker.hpp -+++ b/src/c/backend/include/parray_tracker.hpp -@@ -13,6 +13,10 @@ - - using namespace parray; - -+/** -+ * @brief PArray tracker that tracks PArray mapping state. -+ * Note that this does not track slice PArrays, but a complete PArray. -+ */ - class PArrayTracker { - public: - PArrayTracker(size_t num_devices) : num_devices_(num_devices) { -@@ -204,7 +208,7 @@ protected: - - /// Any worker thread can update states of PArrays. - /// Guard operations by this lock. -- /// TODO(hc): This will be replaced with parallel hash map. -+ /// TODO(hc): This will be replaced with parallel hash map (phmap). - std::mutex mtx; - }; - -diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp -index f38cfac..17f72b3 100644 ---- a/src/c/backend/include/runtime.hpp -+++ b/src/c/backend/include/runtime.hpp -@@ -44,6 +44,7 @@ - #include "parray_tracker.hpp" - #include "profiling.hpp" - #include "resource_requirements.hpp" -+#include "memory_manager.hpp" - - using namespace std::chrono_literals; - using namespace parray; -@@ -985,9 +986,26 @@ public: - /* Should Run, Stop Condition */ - std::atomic should_run = true; - -+ /* Clear all Python PArray references if users request */ -+ std::atomic clear_all_pyparrays = false; -+ /* Clear all C PArray objects if users request in eviction manager */ -+ std::atomic clear_all_cparrays = false; -+ - /* Phase: maps tasks to devices */ - Mapper *mapper; - -+ /* If it is set, break an infinite loop in InnerScheduler::run() -+ and invoke PArray eviction from PythonScheduler */ -+ bool break_for_eviction = false; -+ /* Memory size to evict for each device */ -+ std::vector memory_size_to_evict{0}; -+ -+ /* Set necessary memory size getting from eviction manager -+ on each device */ -+ void set_memory_size_to_evict(size_t, DevID_t); -+ /* Get memory size to evict for each device */ -+ size_t get_memory_size_to_evict(DevID_t); -+ - /* Phase reserves resources to limit/plan task execution*/ - MemoryReserver *memory_reserver; - RuntimeReserver *runtime_reserver; -@@ -995,7 +1013,7 @@ public: - /*Responsible for launching a task. Signals worker thread*/ - Launcher *launcher; - -- InnerScheduler(DeviceManager *device_manager); -+ InnerScheduler(LRUGlobalEvictionManager *memory_manager, DeviceManager *device_manager); - ~InnerScheduler(); - // InnerScheduler(int nworkers); - -@@ -1017,6 +1035,9 @@ public: - /* Set Python "stop" callback */ - void set_stop_callback(stopfunc_t stop_callback); - -+ /* Get a flag that represents if there is still a task to be executed */ -+ bool get_should_run(); -+ - /* Run the scheduler thread. Active for the lifetime of the Parla program */ - void run(); - -@@ -1087,6 +1108,20 @@ public: - /* Release a PArray in a device */ - void remove_parray(InnerPArray *parray, DevID_t global_dev_id); - -+ /* Reflect a PArray removal to PArray trackers */ -+ void remove_parray_from_tracker( -+ parray::InnerPArray *parray, DevID_t global_dev_id); -+ -+ void grab_parray_reference( -+ parray::InnerPArray *parray, DevID_t global_dev_id) { -+ this->mm_->grab_parray_reference(parray, global_dev_id); -+ } -+ -+ void release_parray_reference( -+ parray::InnerPArray *parray, DevID_t global_dev_id) { -+ this->mm_->release_parray_reference(parray, global_dev_id); -+ } -+ - /* Get mapped memory on device */ - size_t get_mapped_memory(DevID_t global_dev_id); - -@@ -1108,10 +1143,17 @@ public: - - DeviceManager *get_device_manager() { return this->device_manager_; } - -+ /* Invoke clearing all C PArray instances from a PArray eviction manager. */ -+ void invoke_all_cparrays_clear(); -+ -+ /* Get a flag that represents whether Python PArrays should be cleared */ -+ bool get_all_pyparrays_clear_flag(); - protected: - /// It manages all device instances in C++. - /// This is destructed by the Cython scheduler. - DeviceManager *device_manager_; -+ -+ LRUGlobalEvictionManager *mm_; - }; - - #endif // PARLA_BACKEND_HPP -diff --git a/src/c/backend/memory_manager.cpp b/src/c/backend/memory_manager.cpp -new file mode 100644 -index 0000000..f829294 ---- /dev/null -+++ b/src/c/backend/memory_manager.cpp -@@ -0,0 +1 @@ -+#include "include/memory_manager.hpp" -diff --git a/src/c/backend/parray.cpp b/src/c/backend/parray.cpp -index 5e555d4..20cb46a 100644 ---- a/src/c/backend/parray.cpp -+++ b/src/c/backend/parray.cpp -@@ -9,7 +9,7 @@ InnerPArray::InnerPArray(void *py_parray, uint64_t id, uint64_t parent_id, - DevID_t num_devices) - : _py_parray(py_parray), id(id), parent_id(parent_id), - _parent_parray(parent_parray), _state(state), _num_devices(num_devices) { -- num_active_tasks.resize(num_devices); -+ num_referring_tasks.resize(num_devices); - } - - const uint64_t InnerPArray::get_size() const { return this->_size; } -@@ -29,32 +29,32 @@ void InnerPArray::add_task(InnerTask *task) { - this->_task_lists.push_back(task); - } - --void InnerPArray::incr_num_active_tasks(DevID_t global_dev_id) { -+void InnerPArray::incr_num_referring_tasks(DevID_t global_dev_id) { - if (this->_parent_parray != nullptr) { -- this->_parent_parray->num_active_tasks[global_dev_id].fetch_add( -+ this->_parent_parray->num_referring_tasks[global_dev_id].fetch_add( - 1, std::memory_order_relaxed); - } else { -- this->num_active_tasks[global_dev_id].fetch_add(1, -+ this->num_referring_tasks[global_dev_id].fetch_add(1, - std::memory_order_relaxed); - } - } - --void InnerPArray::decr_num_active_tasks(DevID_t global_dev_id) { -+void InnerPArray::decr_num_referring_tasks(DevID_t global_dev_id) { - if (this->_parent_parray != nullptr) { -- this->_parent_parray->num_active_tasks[global_dev_id].fetch_sub( -+ this->_parent_parray->num_referring_tasks[global_dev_id].fetch_sub( - 1, std::memory_order_relaxed); - } else { -- this->num_active_tasks[global_dev_id].fetch_sub(1, -+ this->num_referring_tasks[global_dev_id].fetch_sub(1, - std::memory_order_relaxed); - } - } - --size_t InnerPArray::get_num_active_tasks(DevID_t global_dev_id) { -+size_t InnerPArray::get_num_referring_tasks(DevID_t global_dev_id) { - if (this->_parent_parray != nullptr) { -- return this->_parent_parray->num_active_tasks[global_dev_id].load( -+ return this->_parent_parray->num_referring_tasks[global_dev_id].load( - std::memory_order_relaxed); - } else { -- return this->num_active_tasks[global_dev_id].load( -+ return this->num_referring_tasks[global_dev_id].load( - std::memory_order_relaxed); - } - } -diff --git a/src/c/backend/parray_tracker.cpp b/src/c/backend/parray_tracker.cpp -index ac5e88d..e5226d7 100644 ---- a/src/c/backend/parray_tracker.cpp -+++ b/src/c/backend/parray_tracker.cpp -@@ -12,7 +12,6 @@ size_t PArrayTracker::do_parray_creation_(AccessMode access_mode, - if (access_mode >= AccessMode::OUT || is_tracked) { - return 0; - } else { -- - size_t to_move = parray->get_size(); - - // std::cout << "PArrayTracker::do_parray_creation " << std::endl; -@@ -65,8 +64,8 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id, - this->set_parray_unsafe(i, parray_id, false); - } - -- if (access_mode != AccessMode::REMOVED) { -- this->set_parray_unsafe(dev_id, parray_id, false); -+ if (access_mode != AccessMode::FREED) { -+ this->set_parray_unsafe(dev_id, parray_id, true); - } - - if (is_slice) { -@@ -92,7 +91,6 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id, - } - } - */ -- - } else { - // If the PArray is not a slice, it is a parent - // invalidate all its children on all devices except the target -@@ -104,7 +102,7 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id, - this->set_parray_unsafe(i, child_id, false); - } - -- if (access_mode != AccessMode::REMOVED) { -+ if (access_mode != AccessMode::FREED) { - this->set_parray_unsafe(dev_id, child_id, false); - } - } -@@ -217,4 +215,4 @@ size_t PArrayTracker::check_log(const DevID_t dev_id, - // std::cout << "Checked parray removal: " << to_move << std::endl; - - return to_move; --} -\ No newline at end of file -+} -diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp -index 79eef62..6975c82 100644 ---- a/src/c/backend/phases.cpp -+++ b/src/c/backend/phases.cpp -@@ -93,7 +93,7 @@ void Mapper::map_task(InnerTask *task, DeviceRequirementList &chosen_devices) { - mapped_pool.increase(mapped_size); - - InnerPArray *parray = parray_access.first; -- parray->incr_num_active_tasks(global_dev_id); -+ parray->incr_num_referring_tasks(global_dev_id); - } - } - -@@ -132,6 +132,14 @@ void Mapper::run(SchedulerPhase *next_phase) { - this->drain_parray_buffer(); - - // TODO Fix Issue #108 -+ /** XXX(hc): revisit this -+ // Accumulate necessary memory size for PArrays being mapped. -+ std::vector accum_necessary_memory; -+ // Eviction manager does not evict CPU instances; In the future, -+ // this also might be evicted to other devices. But to simplify indexing, -+ // allocate as the number of total devices. -+ accum_necessary_memory.resize(this->device_manager->get_num_devices(DeviceType::All)); -+ ***/ - while (has_task && num_task_mapping_attempt < 20) { - - this->drain_parray_buffer(); -@@ -277,6 +285,9 @@ bool MemoryReserver::check_data_resources(InnerTask *task) { - InnerPArray *parray = parray_access.first; - AccessMode access_mode = parray_access.second; - -+ // Register this PArray to eviction manager's table -+ this->scheduler->grab_parray_reference(parray, local_device_idx); -+ - // If the PArray is not an input, then we don't need to check size - // Note(@dialecticDolt): - // There is literally no such thing as an out type in our syntax why do we -@@ -305,6 +316,10 @@ bool MemoryReserver::check_data_resources(InnerTask *task) { - - status = status && device_status; - if (!status) { -+ // If a device has not enough memory, activate eviction manager -+ this->scheduler->set_memory_size_to_evict( -+ size_on_device * 2, local_device_idx); -+ this->scheduler->break_for_eviction = true; - break; - } - } -diff --git a/src/c/backend/policy.cpp b/src/c/backend/policy.cpp -index 1cf1c76..89a85fb 100644 ---- a/src/c/backend/policy.cpp -+++ b/src/c/backend/policy.cpp -@@ -12,6 +12,8 @@ bool LocalityLoadBalancingMappingPolicy::calc_score_devplacement( - // std::cout << "[Locality-aware- and Load-balancing mapping policy]\n"; - - // Check device resource availability. -+ // Note that this counter is tracking 'launched' or 'reserved' resource -+ // status. - if (!device.check_resource_availability(dev_placement_req.get())) { - // std::cout << "Device resource failure!" << std::endl; - return false; -diff --git a/src/c/backend/scheduler.cpp b/src/c/backend/scheduler.cpp -index a6fadfc..95152aa 100644 ---- a/src/c/backend/scheduler.cpp -+++ b/src/c/backend/scheduler.cpp -@@ -126,8 +126,18 @@ template class WorkerPool; - - // Scheduler Implementation - --InnerScheduler::InnerScheduler(DeviceManager *device_manager) -- : device_manager_(device_manager) { -+InnerScheduler::InnerScheduler(LRUGlobalEvictionManager* memory_manager, -+ DeviceManager *device_manager) -+ : device_manager_(device_manager), mm_(memory_manager) { -+ -+ // For now, it does not evict PArrays on CPU memory. -+ this->memory_size_to_evict.resize( -+ device_manager->template get_num_devices()); -+ -+ // A dummy task count is used to keep the scheduler alive. -+ // NOTE: At least one task must be added to the scheduler by the main thread, -+ // otherwise the runtime will finish immediately -+ // this->increase_num_active_tasks(); - - this->workers.set_num_workers(1); - -@@ -157,31 +167,70 @@ void InnerScheduler::set_stop_callback(stopfunc_t stop_callback) { - this->stop_callback = stop_callback; - } - -+bool InnerScheduler::get_should_run() { -+ return this->should_run.load(); -+} -+ -+bool InnerScheduler::get_all_pyparrays_clear_flag() { -+ return this->clear_all_pyparrays.load(); -+} -+ -+void InnerScheduler::set_memory_size_to_evict( -+ size_t size, DevID_t dev_id) { -+ this->memory_size_to_evict[dev_id] = size; -+} -+ -+size_t InnerScheduler::get_memory_size_to_evict(DevID_t dev_id) { -+ return this->memory_size_to_evict[dev_id]; -+} -+ - void InnerScheduler::run() { - NVTX_RANGE("Scheduler::run", NVTX_COLOR_RED) -- unsigned long long iteration_count = 0; -+ this->clear_all_cparrays = false; -+ this->clear_all_pyparrays = false; - while (this->should_run.load()) { -+ this->break_for_eviction = false; - auto status = this->activate(); - if (this->sleep_flag) { - std::this_thread::sleep_for(std::chrono::milliseconds(this->sleep_time)); - } -+ if (this->break_for_eviction) { -+ // Yield a control to a Python scheduler to evict PArrays since -+ // PArray coherency protocol is managed at there. -+ break; -+ } -+ if (this->clear_all_cparrays.load()) { -+ // TODO(hc): This should be more generalized and refined. -+ // Temporarily use it as experimental puprose. -+ std::cout << "Clear all C/Python parrays..\n"; -+ this->mm_->clear_all_instances(); -+ this->clear_all_pyparrays = true; -+ break; -+ } - } - } - - void InnerScheduler::stop() { - LOG_INFO(SCHEDULER, "Stopping scheduler"); - this->should_run = false; -- launch_stop_callback(this->stop_callback, this->py_scheduler); -+ // XXX(hc): To process PArray eviction on Python, -+ // Python scheduler now has an while loop that iterates until there is -+ // no more task, and it wraps C scheduler's loop. -+ // Therefore, there is no point for C++ scheduler to explicitly invoke -+ // this callback at here. Python scheduler knows when it needs to stop. -+ //launch_stop_callback(this->stop_callback, this->py_scheduler); - LOG_INFO(SCHEDULER, "Stopped scheduler"); - } - -+void InnerScheduler::invoke_all_cparrays_clear() { -+ this->clear_all_cparrays = true; -+} -+ - Scheduler::Status InnerScheduler::activate() { - // std::cout<< "Scheduler Activated" << std::endl; -- - this->mapper->run(this->memory_reserver); - this->memory_reserver->run(this->runtime_reserver); - this->runtime_reserver->run(this->launcher); -- - // LOG_TRACE(SCHEDULER, "ReadyPhase Status: {}", this->runtime_reserver); - return this->status; - } -@@ -289,6 +338,15 @@ void InnerScheduler::remove_parray(InnerPArray *parray, DevID_t global_dev_id) { - // Could also be to call DELETED or REMOVED status on do_log - } - -+void InnerScheduler::remove_parray_from_tracker( -+ parray::InnerPArray *parray, DevID_t global_dev_id) { -+ AccessMode access_mode = AccessMode::FREED; -+ this->mapper->get_parray_tracker()->do_log(global_dev_id, -+ std::make_pair(parray, access_mode)); -+ this->memory_reserver->get_parray_tracker()->do_log(global_dev_id, -+ std::make_pair(parray, access_mode)); -+} -+ - size_t InnerScheduler::get_mapped_memory(DevID_t global_dev_idx) { - Device *device = - this->device_manager_->get_device_by_global_id(global_dev_idx); -@@ -362,7 +420,12 @@ void InnerScheduler::task_cleanup_postsync(InnerWorker *worker, InnerTask *task, - for (size_t j = 0; j < parray_access_list.size(); ++j) { - auto &parray_access = parray_access_list[j]; - InnerPArray *parray = parray_access.first; -- parray->decr_num_active_tasks(dev_id); -+ parray->decr_num_referring_tasks(dev_id); -+ // Decrease this PArray's reference count. -+ // If this becomes 0, this instance will be release -+ // when the PArray coherency protocol updates it -+ // to eviction state. -+ this->release_parray_reference(parray, dev_id); - } - } - -diff --git a/src/python/parla/__init__.py b/src/python/parla/__init__.py -index ba53b56..49ca47a 100644 ---- a/src/python/parla/__init__.py -+++ b/src/python/parla/__init__.py -@@ -7,6 +7,7 @@ from .cython import core - from .cython import device_manager - from .cython import device - from .cython import variants -+from .cython import mm - from .common.spawn import spawn - from .common import parray - -@@ -22,6 +23,7 @@ TaskSpace = tasks.TaskSpace - Tasks = tasks.TaskCollection - - DeviceManager = device_manager.PyDeviceManager -+PyMM = mm.PyMM - - Stream = device.Stream - create_env = tasks.create_env -@@ -59,6 +61,7 @@ class Parla: - self.sig = sig_type - self.handle_interrupt = True - self._device_manager = DeviceManager(dev_config_file) -+ self._memory_manager = PyMM(self._device_manager) - - if logfile is None: - logfile = os.environ.get("PARLA_LOGFILE", None) -@@ -77,7 +80,9 @@ class Parla: - if hasattr(self, "_sched"): - raise ValueError( - "Do not use the same Parla object more than once.") -- self._sched = self.scheduler_class(self._device_manager, **self.kwds) -+ self._sched = self.scheduler_class(self._memory_manager, -+ self._device_manager, -+ **self.kwds) - - self.interuppted = False - self.released = False -diff --git a/src/python/parla/common/parray/@ b/src/python/parla/common/parray/@ -new file mode 100644 -index 0000000..af841d1 ---- /dev/null -+++ b/src/python/parla/common/parray/@ -@@ -0,0 +1,541 @@ -+from __future__ import annotations -+from typing import TYPE_CHECKING, Union, List, Dict, Tuple, Any -+ -+import numpy -+import ctypes -+ -+#TODO: Fix this to be more stable and less of a hack. -+try: -+ import cupy -+except (ImportError, AttributeError): -+ import numpy as cupy -+ -+from .coherence import CPU_INDEX -+ -+if TYPE_CHECKING: # False at runtime -+ import cupy -+ ndarray = Union[numpy.ndarray, cupy.ndarray] -+ SlicesType = Union[slice, int, tuple] -+ IndicesMapType = List[Union[Dict[int, int], tuple]] -+ from parla.cython.cyparray_state import CyPArrayState -+ -+class MultiDeviceBuffer: -+ """Underlying Buffer of PArray. -+ -+ It holds per device array copy and also index mapping. -+ """ -+ -+ _buffer: Dict[int, ndarray | List[ndarray] | None] -+ shape: tuple -+ _indices_map: Dict[int, List[IndicesMapType] | None] -+ _cyparray_state: CyPArrayState -+ -+ def __init__(self, num_gpu: int, cyparray_state: CyPArrayState): -+ # per device buffer -+ # key: device_id -+ # val: single (complete) ndarray or list of (sub) ndarray -+ self._buffer = {n: None for n in range(num_gpu)} # add gpu id -+ self._buffer[CPU_INDEX] = None # add cpu id -+ -+ # per device indices mapping -+ # key: device_id -+ # val: list of {global_index: local_index} and tuple(begin, end, stop), and the tuple is a represent of slice(begin, end, stop) -+ self._indices_map = {n: None for n in range(num_gpu)} -+ self._indices_map[CPU_INDEX] = None -+ -+ # the shape of the complete array -+ self.shape = () -+ -+ self._cyparray_state = cyparray_state -+ -+ def nbytes_at(self, device_id:int) -> int: -+ """ -+ Return the buffer size at `device_id` -+ """ -+ buffer = self._buffer[device_id] -+ if buffer is None: -+ return 0 -+ elif isinstance(buffer, list): # subarray at this device buffer -+ # size is the sum -+ nbytes = 0 -+ for subarray in buffer: -+ nbytes += subarray.nbytes -+ return nbytes -+ else: # complete array -+ return buffer.nbytes -+ -+ def set_complete_array(self, array: ndarray) -> int: -+ """ -+ Add array into the buffer (based on array's device). -+ -+ Args: -+ array: :class:`cupy.ndarray` or :class:`numpy.array` object -+ -+ Return: -+ a location (device_id) of the array -+ """ -+ # get the array's location -+ if isinstance(array, numpy.ndarray): -+ location = CPU_INDEX -+ else: -+ location = int(array.device) -+ -+ self._buffer[location] = array -+ self.shape = array.shape -+ self._cyparray_state.set_exist_on_device(location, True) -+ return location -+ -+ def set(self, device_id: int, array: ndarray, is_complete: bool = True, overwrite: bool = False) -> None: -+ """ -+ Set copy at a device, also clean up existing `indices_map` if necessary -+ -+ Args: -+ device_id: gpu device_id or CPU_INDEX -+ array: :class:`cupy.ndarray` or :class:`numpy.array` object -+ is_complete: True if `array` is a complete copy, otherwise `array` is a subarray -+ overwrite: True if need to clean other subarray copy inside the device before assign the new array -+ """ -+ if is_complete: -+ self._indices_map[device_id] = None -+ self._buffer[device_id] = array -+ else: -+ if not isinstance(self._buffer[device_id], List) or overwrite: -+ self._indices_map[device_id] = None -+ self._buffer[device_id] = [array] -+ else: -+ self._buffer[device_id].append(array) -+ self._cyparray_state.set_exist_on_device(device_id, True) -+ -+ def get(self, device_id: int) -> ndarray | List[ndarray] | None: -+ """ -+ Return the copy at a device -+ -+ Args: -+ device_id: gpu device_id or CPU_INDEX -+ -+ Return -+ :class:`cupy.ndarray` or :class:`numpy.array` object -+ """ -+ return self._buffer[device_id] -+ -+ def get_global_slices(self, device_id:int, subarray_index:int) -> SlicesType | None: -+ """ -+ Return global slices of one copy at the device. -+ -+ If the copy is complete, return None -+ """ -+ if self._indices_map[device_id] is None: -+ return None -+ else: -+ slices = [] -+ for device_indices in self._indices_map[device_id][subarray_index]: -+ if isinstance(device_indices, dict): -+ index = list(device_indices.keys()) -+ if len(index) == 1: -+ slices.append(index[0]) -+ else: -+ slices.append(index) -+ else: -+ slices.append(slice(*device_indices)) -+ -+ return tuple(slices) -+ -+ @staticmethod -+ def _map_int_with_int_map(n: int, int_map: Dict[int, int]) -> int | None: -+ """ -+ Find the mapping of `n` in `int_map` -+ -+ if `n` not in `int_map`, return None -+ -+ example: -+ n: 2 -+ int_map: {1:0, 2:1} -+ return: 1 -+ """ -+ return None if n not in int_map else int_map[n] -+ -+ @staticmethod -+ def _map_int_with_slice(n: int, target_slice: tuple) -> int | None: -+ """ -+ Find the mapping of `n` in a `target_slice` (find index of `n` in `target_slice`) -+ `target_slice` is a tuple(begin, end, step) -+ -+ if `n` not in `target_slice`, return None -+ -+ example: -+ n: 2 -+ target_slice: (2, 4, 1) -+ return: 0 -+ """ -+ # TODO: assume slice is simple (no neg value) -+ begin, end, step = target_slice -+ step = 1 if step is None else step -+ -+ # bound checking -+ if n < begin or n >= end: -+ return None -+ if (n - begin) % step != 0: -+ return None -+ -+ return (n - begin) // step -+ -+ @staticmethod -+ def _map_slice_with_slice(input_slice: tuple, target_slice: tuple) -> tuple | None: -+ """ -+ Find the mapping of `input_slice` in a `target_slice` -+ `input_slice` and `target_slice` is a tuple(begin, end, step) -+ -+ if `input_slice` not a subset of `target_slice`, return None -+ -+ example: -+ input_slice: (2, 10, 4) -+ target_slice: (0, 10, 2) -+ return: (1, 5, 2) -+ """ -+ # TODO: assume slice is simple (no neg value) -+ target_begin, target_end, target_step = target_slice -+ target_step = 1 if target_step is None else target_step -+ -+ input_begin, input_end, input_step = input_slice -+ input_step = 1 if input_step is None else input_step -+ -+ mapped_begin = MultiDeviceBuffer._map_int_with_slice( -+ input_begin, target_slice) -+ -+ # get the last possible element in range of `input_slice` -+ # TODO: what if last_element < input_begin ? -+ last_element = input_end - input_step + (input_end - input_begin) % input_step -+ mapped_end = MultiDeviceBuffer._map_int_with_slice(last_element, target_slice) -+ -+ if mapped_begin is None or mapped_end is None: -+ return None -+ -+ # adjust step -+ if input_step % target_step != 0: -+ return None -+ mapped_step = input_step // target_step -+ -+ return mapped_begin, mapped_end + 1, mapped_step # tuple -+ -+ def map_local_slices(self, device_id: int, global_slices: SlicesType) -> (int, SlicesType): -+ """ -+ Map a given global slices to local slices wrt buffer at the device. -+ -+ Raise error if `global_slices` out of range -+ -+ Return subarray_index: the index of subarray in the list of `_buffer[device_id]` -+ local_slices: the local slices which maps to the `global_slices` -+ Note: this method assume a indices mapping exists for this device -+ """ -+ # indexing into the whole array, index of out bound -+ not_tuple = False -+ if not isinstance(global_slices, tuple): # if not a tuple, make it a tuple -+ global_slices = tuple([global_slices]) -+ not_tuple = True -+ -+ local_slices = [] -+ -+ if len(self.shape) < len(global_slices): -+ raise IndexError(f"index out of range, index:{global_slices}") -+ -+ final_subarray_index = 0 -+ -+ for subarray_index in range(len(self._indices_map[device_id])): # for each subarray at this device -+ indices_map = self._indices_map[device_id][subarray_index] -+ -+ for d in range(len(global_slices)): -+ size = self.shape[d] # number of entries at this axis -+ global_index = global_slices[d] -+ index_map = None if d >= len(indices_map) else indices_map[d] -+ -+ if index_map is None: # None means 1:1 map to all elements at this axis -+ local_index = global_index -+ elif isinstance(index_map, dict) and len(index_map) == 1: -+ # special case, this axis was indexed by a int, so -+ # dimension was reduced by 1, -+ # need to ignore this axis, just check index match or not -+ if list(index_map.keys())[0] == global_index: # false if type or value doesn't match -+ continue -+ else: -+ local_index = None -+ elif isinstance(index_map, tuple): -+ if isinstance(global_index, int): # int vs slice -+ local_index = MultiDeviceBuffer._map_int_with_slice(global_index, index_map) -+ elif isinstance(global_index, list): # List[int] vs slice -+ local_index = [MultiDeviceBuffer._map_int_with_slice(i, index_map) for i in global_index] -+ -+ # any index out of bound? -+ if None in local_index: -+ local_index = None -+ elif isinstance(global_index, slice): # slice vs slice -+ # slice to tuple -+ slice_tuple = global_index.indices(size) -+ local_tuple = MultiDeviceBuffer._map_slice_with_slice(slice_tuple, index_map) -+ if local_tuple is None: -+ local_index = None -+ else: -+ local_index = slice(*local_tuple) -+ else: -+ raise IndexError(f"Unsupported slices type: {type(global_index)}") -+ else: # Map is int or list -+ if isinstance(global_index, int): # int vs int/list -+ local_index = self._map_int_with_int_map(global_index, index_map) -+ elif isinstance(global_index, list): # list vs int/list -+ local_index = [self._map_int_with_int_map(i, index_map) for i in global_index] -+ -+ if None in local_index: -+ local_index = None -+ elif isinstance(global_index, slice): # slice vs int/list -+ # slice to tuple -+ slice_tuple = global_index.indices(size) -+ local_index = [self._map_int_with_int_map(i, index_map) for i in range(*slice_tuple)] -+ -+ if None in local_index: -+ local_index = None -+ else: -+ raise IndexError(f"Unsupported slices type {type(global_index)}") -+ -+ # if None, it means index out of range at this axis -+ if local_index is None: -+ # check next copy -+ local_slices = None -+ break -+ -+ local_slices.append(local_index) -+ -+ if local_slices is None: # result is not found for this subarray -+ if subarray_index == len(self._indices_map[device_id]) - 1: # this is the last subarray -+ local_slices = None # non slices is found -+ else: # check next subarray -+ local_slices = [] # clear intermidate result -+ else: -+ final_subarray_index = subarray_index -+ break -+ -+ if local_slices is None: -+ raise IndexError(f"index out of range, index:{global_slices}") -+ elif not_tuple: -+ if len(local_slices) == 0: # only be possible when special case int vs int exists and all axis are ignored -+ return final_subarray_index, slice(None, None, None) -+ else: -+ return final_subarray_index, local_slices[0] -+ else: -+ return final_subarray_index, tuple(local_slices) -+ -+ def set_slices_mapping(self, device_id: int, global_slices: SlicesType): -+ """ -+ set a global slices to local slices mapping wrt buffer at the device. -+ -+ Raise error if `global_slices` is higher dim than shape -+ Note: this call doesn't check slice is within range, if it is not in range -+ exception will be trigger later when trying to index into the copy -+ """ -+ if not isinstance(global_slices, tuple): # if not a tuple, make it a tuple -+ global_slices = tuple([global_slices]) -+ -+ if len(self.shape) < len(global_slices): -+ raise IndexError(f"index out of range, index:{global_slices}") -+ -+ slices_map_list = [] -+ for d in range(len(global_slices)): -+ size = self.shape[d] # number of entries at this axis -+ global_slice = global_slices[d] -+ -+ if isinstance(global_slice, int): # a single integer -+ slice_map = {global_slice: 0} -+ elif isinstance(global_slice, list): # a list of integer -+ slice_map = {global_slice[i]: i for i in range(len(global_slice))} -+ elif isinstance(global_slice, slice): # slice -+ # save slice as a tuple -+ # None in slice will be instantiated by concrete values -+ slice_map = global_slice.indices(size) -+ else: -+ raise IndexError(f"Unsupported slices type {type(global_slice)}") -+ slices_map_list.append(slice_map) -+ -+ if self._indices_map[device_id] is None: -+ self._indices_map[device_id] = [slices_map_list] -+ else: -+ self._indices_map[device_id].append(slices_map_list) -+ -+ def get_by_global_slices(self, device_id: int, global_slices: SlicesType): -+ """ -+ Indexing/Slicing the buffer by `global_slices`. -+ -+ `global_slices` will be first converted into local slices -+ -+ Args: -+ device_id: gpu device_id or CPU_INDEX -+ global_slices: slice/ints/tuple/list, use the same format as advance indexing of numpy -+ -+ Return -+ :class:`cupy.ndarray` or :class:`numpy.array` object or `None` if there is no copy at that device -+ """ -+ # check if a copy exists at this device -+ # -+ # This is needed for the usage of @spawn() annotation -+ # sometimes device has no copy but still need a parray slices view object in advance -+ # Example: -+ # A = parray(numpy_array) -+ # @spawn(inout=[A], placement=gpu(0)) -+ # def task1(): -+ # ... a task move A to GPU. free CPU copy ... -+ # @spawn(inout=[A[0]], placement=gpu(1)) -+ # def task2(): -+ # error: cpu's copy is freed -> `inout=[A[0]]` trigger an exception, -+ # since it try a slice A at CPU (spawn() itself is happened at outer CPU task) -+ if self._buffer[device_id] is None: -+ return None -+ -+ # check if there is a mapping -+ if self._indices_map[device_id] is None: -+ return self._buffer[device_id].__getitem__(global_slices) -+ else: -+ # map global slices to local slices -+ subarray_index, local_slices = self.map_local_slices(device_id, global_slices) -+ return self._buffer[device_id][subarray_index].__getitem__(local_slices) -+ -+ def set_by_global_slices(self, device_id: int, global_slices: SlicesType, value: ndarray | Any): -+ """ -+ Indexing/Slicing the buffer by `global_slices` and set value. -+ -+ `global_slices` will be first converted into local slices -+ -+ Args: -+ device_id: gpu device_id or CPU_INDEX -+ global_slices: slice/ints/tuple/list, use the same format as advance indexing of numpy -+ value: the data to set -+ -+ Return -+ :class:`cupy.ndarray` or :class:`numpy.array` object -+ """ -+ # check if there is a mapping -+ if self._indices_map[device_id] is None: -+ self._buffer[device_id].__setitem__(global_slices, value) -+ else: -+ # map global slices to local slices -+ subarray_index, local_slices = self.map_local_slices(device_id, global_slices) -+ self._buffer[device_id][subarray_index].__setitem__(local_slices, value) -+ -+ -+ def _move_data(self, copy_func, dst: int, src: int, subarray_index: int, dst_slices: SlicesType, src_slices: SlicesType, dst_is_current_device:bool = True): -+ """ -+ Helper function for copy_data_between_device -+ """ -+ if dst_is_current_device: -+ if dst_slices is None and src_slices is None: # Complete to Complete -+ self._buffer[dst] = copy_func(self._buffer[src]) -+ elif dst_slices is None and src_slices is not None: # Incomplete to Complete -+ self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index]) -+ elif dst_slices is not None and src_slices is None: # Complete to incomplete -+ if self._buffer[dst] is None: -+ self._buffer[dst] = [] -+ self._buffer[dst].append(copy_func(self._buffer[src][dst_slices])) -+ else: # incomplete to incomplete -+ raise ValueError("Copy from subarray to subarray is unsupported") -+ else: -+ with cupy.cuda.Device(dst): # switch device -+ if dst_slices is None and src_slices is None: # Complete to Complete -+ self._buffer[dst] = copy_func(self._buffer[src]) -+ elif dst_slices is None and src_slices is not None: # Incomplete to Complete -+ self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index]) -+ elif dst_slices is not None and src_slices is None: # Complete to incomplete -+ if self._buffer[dst] is None: -+ self._buffer[dst] = [] -+ self._buffer[dst].append(copy_func(self._buffer[src][dst_slices])) -+ else: # incomplete to incomplete -+ raise ValueError("Copy from subarray to subarray is unsupported") -+ -+ def copy_data_between_device(self, dst: int, src: int, dst_is_current_device: bool = True) -> None: -+ """ -+ Copy data from src to dst. -+ -+ dst is current device if `dst_is_current_device` is True -+ """ -+ # a function to copy data between GPU devices async -+ def copy_from_device_async(src): -+ dst_data = cupy.empty_like(src) -+ dst_data.data.copy_from_device_async(src.data, src.nbytes) -+ return dst_data -+ -+ if self._indices_map[src] is None: -+ src_slices_list = [None] -+ else: -+ src_slices_list = [self.get_global_slices(src, i) for i in range(len(self._indices_map[src]))] -+ -+ # TRICK: if there are multiple subarray in this device, always pick the last one -+ # this is because load of data always comes together with create indices mapping -+ # so the indices mapping will put at the end of self._indices_map -+ dst_slices = self.get_global_slices(dst, -1) -+ -+ for subarray_index in range(len(src_slices_list)): -+ src_slices = src_slices_list[subarray_index] -+ if src == CPU_INDEX: # copy from CPU to GPU -+ self._move_data(cupy.asarray, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device) -+ elif dst != CPU_INDEX: # copy from GPU to GPU -+ self._move_data(copy_from_device_async, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device) -+ else: # copy from GPU to CPU -+ self._move_data(cupy.asnumpy, dst, src, subarray_index, dst_slices, src_slices) # dst_is_current_device is no need if dst is CPU -+ self._cyparray_state.set_exist_on_device(dst, True) -+ -+ def get_slices_hash(self, global_slices: SlicesType) -> int: -+ """ -+ Get hash value of a slices of complete array. -+ -+ This could be done by replaing list and slice to tuple -+ """ -+ # little chance to have collision, but what if it happened? -+ hash_value = 17 # use a none zero hash value, so hash(0) != 0 -+ prime = 31 -+ if not isinstance(global_slices, tuple): -+ if isinstance(global_slices, list): -+ # Built-int hash() method might return negtive value. -+ # c_size_t is to ensure it is not negative -+ hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(global_slices))).value -+ elif isinstance(global_slices, slice): -+ hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices.indices(self.shape[0]))).value -+ else: -+ hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices)).value -+ else: -+ if len(self.shape) < len(global_slices): -+ raise IndexError(f"index out of range, index:{global_slices}") -+ -+ for d in range(len(global_slices)): -+ index = global_slices[d] -+ if isinstance(index, list): -+ hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(index))).value -+ elif isinstance(index, slice): -+ hash_value = hash_value * prime + ctypes.c_size_t(hash(index.indices(self.shape[d]))).value -+ else: -+ hash_value = hash_value * prime + ctypes.c_size_t(hash(index)).value -+ -+ return hash_value -+ -+ def __str__(self): -+ return str(self._buffer) -+ -+ def __contains__(self, device_id): -+ """ -+ Return True if there is a copy in this device -+ """ -+ return device_id in self._buffer and self._buffer[device_id] is not None -+ -+ def clear(self, device_id) -> None: -+ """ -+ Clear data in device_id -+ """ -+ import gc -+ mempool = cupy.get_default_memory_pool() -+ pinned_mempool = cupy.get_default_pinned_memory_pool() -+ print("inside evict():", mempool.used_bytes(), flush=True) -+ pritn("total:", mempool.total_bytes()) -+ print("device id:", device_id, flush=True) -+#print("inside pinned evict():", pinned_mempool.used_bytes(), flush=True) -+ print("evicting array size:", self._buffer[device_id].nbytes) -+ print("type:", type(self._buffer[device_id])) -+ del self._buffer[device_id] -+ print("After inside evict():", mempool.used_bytes(), flush=True) -+#print("After inside pinned evict():", pinned_mempool.used_bytes(), flush=True) -+ self._indices_map[device_id] = None -+ self._buffer[device_id] = None -+ self._cyparray_state.set_exist_on_device(device_id, False) -diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py -index e25396c..8d89075 100644 ---- a/src/python/parla/common/parray/core.py -+++ b/src/python/parla/common/parray/core.py -@@ -120,6 +120,8 @@ class PArray: - # Note(@dialecticDolt):It should be valid to create PArrays outside of a scheduler context!! - # FIXME - -+ # Register this PArray to tracker and make a link between -+ # C PArray instance. - scheduler = get_scheduler() - if scheduler is None: - raise NotImplementedError( -@@ -129,6 +131,7 @@ class PArray: - num_devices = len(scheduler.device_manager.get_all_devices()) - self._cy_parray = CyPArray( - self, self.ID, self.parent_ID, self.parent, self._cyparray_state, num_devices) -+ # record the size in Cython PArray - self._cy_parray.set_size(self.subarray_nbytes) - - target_dev_id = - \ -@@ -301,7 +304,7 @@ class PArray: - f"Parent_ID: {self.parent_ID if self.ID != self.parent_ID else None}, " - f"Slice: {self._slices[0] if self.ID != self.parent_ID else None}, " - f"Bytes: {self.subarray_nbytes}, " -- f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}") -+ f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}", flush=True) - for device_id, state in self._coherence._local_states.items(): - if device_id == CPU_INDEX: - device_name = "CPU" -@@ -311,13 +314,13 @@ class PArray: - - if isinstance(state, dict): - print( -- f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy: # states of slices is unordered wrt the below slices") -+ f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy: # states of slices is unordered wrt the below slices", flush=True) - for slice, slice_id in zip(self._array._indices_map[device_id], range(len(self._array._indices_map[device_id]))): - print( -- f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}") -+ f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}", flush=True) - else: -- print(f"state: {state_str_map[state]}") -- print("---End of Overview") -+ print(f"state: {state_str_map[state]}", flush=True) -+ print("---End of Overview", flush=True) - - # slicing/indexing - -@@ -395,7 +398,7 @@ class PArray: - - with self._coherence_cv[device_id]: - operations = self._coherence.evict(device_id, keep_one_copy) -- if operations[0].inst == MemoryOperation.ERROR: -+ if len(operations) != 0 and operations[0].inst == MemoryOperation.ERROR: - return False # cannot perform the eviction - self._process_operations(operations) - -@@ -479,11 +482,21 @@ class PArray: - # f"Evicting {self.name} from {op.src}, size: {to_free} bytes", flush=True) - - scheduler = get_scheduler() -- if (to_free > 0) and (scheduler is not None): -- # This frees the memory on the device in the mapped and reserved pools -- scheduler.device_manager.free_memory(op.src, to_free) -- # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well. -- -+ if scheduler is not None: -+ if to_free > 0: -+ # This frees the memory on the device in the mapped and reserved pools -+ scheduler.device_manager.free_memory(op.src, to_free) -+ # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well. -+ src_global_dev_id = -+ scheduler.device_manager.parrayid_to_globalid(op.src) -+ if self._cy_parray.get_num_referring_tasks(src_global_dev_id) == 0: -+ # If none of active tasks refers this PArray, -+ # remove this PArray on the src device from -+ # the PArray tracker's table. -+ scheduler.remove_parray_from_tracker( -+ self._cy_parray, src_global_dev_id) -+ # decrement the reference counter, relying on GC to free the memor -+ self._array.clear(op.src) - elif op.inst == MemoryOperation.ERROR: - raise RuntimeError( - "PArray gets an error from coherence protocol") -@@ -906,5 +919,11 @@ class PArray: - def get_parray_parentid_from_cpp(self): - return self._cy_parray.get_parray_parentid() - -- def get_num_active_tasks(self, global_dev_id): -- return self._cy_parray.get_num_active_tasks(global_dev_id) -+ def get_num_referring_tasks(self, global_dev_id): -+ return self._cy_parray.get_num_referring_tasks(global_dev_id) -+ -+ def __del__(self): -+ # Users can explicitly call `del` over a Python PArray. -+ # In this case, detroy its array instance. -+ # TODO(hc): This code is not tested yet -+ self._array = None -diff --git a/src/python/parla/common/parray/memory.py b/src/python/parla/common/parray/memory.py -index 05cef02..89bf74c 100644 ---- a/src/python/parla/common/parray/memory.py -+++ b/src/python/parla/common/parray/memory.py -@@ -573,3 +573,9 @@ class MultiDeviceBuffer: - self._buffer[device_id] = None - self._cyparray_state.set_exist_on_device(device_id, False) - return to_free -+ -+ def __del__(self): -+ for i in range(0, len(self._buffer)): -+ self._cyparray_state.set_exist_on_device(i, False) -+ self._indices_map = None -+ self._buffer = None -diff --git a/src/python/parla/cython/CMakeLists.txt b/src/python/parla/cython/CMakeLists.txt -index 431fc58..7378df9 100644 ---- a/src/python/parla/cython/CMakeLists.txt -+++ b/src/python/parla/cython/CMakeLists.txt -@@ -14,6 +14,7 @@ add_cython_target(tasks tasks.pyx CXX PY3) - add_cython_target(scheduler scheduler.pyx CXX PY3) - add_cython_target(device device.pyx CXX PY3) - add_cython_target(device_manager device_manager.pyx CXX PY3) -+add_cython_target(mm mm.pyx CXX PY3) - add_cython_target(cyparray_state cyparray_state.pyx CXX PY3) - add_cython_target(cyparray cyparray.pyx CXX PY3) - add_cython_target(variants variants.pyx CXX PY3) -@@ -23,6 +24,7 @@ add_library(tasks MODULE ${tasks}) - add_library(scheduler MODULE ${scheduler}) - add_library(device MODULE ${device}) - add_library(device_manager MODULE ${device_manager}) -+add_library(mm MODULE ${mm}) - add_library(cyparray_state MODULE ${cyparray_state}) - add_library(cyparray MODULE ${cyparray}) - add_library(variants MODULE ${variants}) -@@ -37,6 +39,7 @@ install(TARGETS tasks LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/p - install(TARGETS scheduler LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) - install(TARGETS device LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) - install(TARGETS device_manager LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) -+install(TARGETS mm LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) - install(TARGETS cyparray_state LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) - install(TARGETS cyparray LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) - install(TARGETS variants LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/) -@@ -89,6 +92,14 @@ target_include_directories(device_manager PUBLIC ${NumPy_INCLUDE_DIRS}) - target_include_directories(device_manager PUBLIC ${PYTHON_INCLUDE_DIRS}) - target_include_directories(device_manager PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - -+target_link_libraries(mm ${PYTHON_LIBRARIES}) -+target_link_libraries(mm backend) -+target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend) -+target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend/include) -+target_include_directories(mm PUBLIC ${NumPy_INCLUDE_DIRS}) -+target_include_directories(mm PUBLIC ${PYTHON_INCLUDE_DIRS}) -+target_include_directories(mm PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -+ - target_link_libraries(cyparray_state ${PYTHON_LIBRARIES}) - target_link_libraries(cyparray_state backend) - target_include_directories(cyparray_state PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend) -@@ -119,6 +130,7 @@ python_extension_module(tasks) - python_extension_module(scheduler) - python_extension_module(device) - python_extension_module(device_manager) -+python_extension_module(mm) - python_extension_module(cyparray_state) - python_extension_module(cyparray) - python_extension_module(variants) -diff --git a/src/python/parla/cython/core.pxd b/src/python/parla/cython/core.pxd -index 9643d9f..76a3f75 100644 ---- a/src/python/parla/cython/core.pxd -+++ b/src/python/parla/cython/core.pxd -@@ -4,6 +4,7 @@ cimport cython - from parla.cython.device_manager cimport DeviceManager - from parla.cython.device cimport Device, CyDevice - from parla.cython.cyparray cimport InnerPArray -+from parla.cython.mm cimport LRUGlobalEvictionManager - - from libc.stdint cimport uint32_t, uint64_t, int64_t - from libcpp cimport bool -@@ -22,6 +23,7 @@ cdef extern from "include/gpu_utility.hpp" nogil: - void gpu_busy_sleep(const int device, const unsigned long cycles, - uintptr_t stream_ptr) - -+ - cdef extern from "include/runtime.hpp" nogil: - ctypedef void (*launchfunc_t)(void* py_scheduler, void* py_task, void* py_worker) - ctypedef void (*stopfunc_t)(void*) -@@ -130,15 +132,19 @@ cdef extern from "include/runtime.hpp" nogil: - - bool should_run - -- InnerScheduler(DeviceManager* cpp_device_manager) -+ InnerScheduler(LRUGlobalEvictionManager* cpp_memory_manager, DeviceManager* cpp_device_manager) - - void set_num_workers(int num_workers) - void set_py_scheduler(void* py_scheduler) - void set_stop_callback(stopfunc_t func) - -+ bool get_should_run() -+ - void run() except + - void stop() - -+ long long int get_memory_size_to_evict(int dev_id) except + -+ - void activate_wrapper() - - void spawn_task(InnerTask* task) -@@ -149,7 +155,6 @@ cdef extern from "include/runtime.hpp" nogil: - void task_cleanup_presync(InnerWorker* worker, InnerTask* task, int state) except + - void task_cleanup_postsync(InnerWorker* worker, InnerTask* task, int state) except + - -- int get_num_active_tasks() - void increase_num_active_tasks() - void decrease_num_active_tasks() - -@@ -168,7 +173,10 @@ cdef extern from "include/runtime.hpp" nogil: - void spawn_wait() - - void create_parray(InnerPArray* parray, int parray_dev_id) -+ void remove_parray_from_tracker(InnerPArray* parray, int dev_id) - -+ void invoke_all_cparrays_clear() -+ bool get_all_pyparrays_clear_flag() - - - cdef extern from "include/profiling.hpp" nogil: -diff --git a/src/python/parla/cython/core.pyx b/src/python/parla/cython/core.pyx -index d2ac9f9..68891c5 100644 ---- a/src/python/parla/cython/core.pyx -+++ b/src/python/parla/cython/core.pyx -@@ -12,6 +12,7 @@ from parla.common.globals import AccessMode - from parla.cython.device cimport Device - from parla.cython.cyparray cimport CyPArray - from parla.cython.device_manager cimport CyDeviceManager, DeviceManager -+from parla.cython.mm cimport CyMM - import threading - from enum import IntEnum, auto - from parla.common.globals import cupy -@@ -585,14 +586,15 @@ cdef class PyInnerWorker: - cdef class PyInnerScheduler: - cdef InnerScheduler* inner_scheduler - -- def __cinit__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): -+ def __cinit__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): - cdef InnerScheduler* _inner_scheduler - cdef DeviceManager* _cpp_device_manager = cy_device_manager.get_cpp_device_manager() -+ cdef LRUGlobalEvictionManager* _cpp_memory_manager = cy_memory_manager.get_cpp_memory_manager() - -- _inner_scheduler = new InnerScheduler(_cpp_device_manager) -+ _inner_scheduler = new InnerScheduler(_cpp_memory_manager, _cpp_device_manager) - self.inner_scheduler = _inner_scheduler - -- def __init__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): -+ def __init__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler): - cdef InnerScheduler* _inner_scheduler - _inner_scheduler = self.inner_scheduler - -@@ -610,6 +612,14 @@ cdef class PyInnerScheduler: - def __dealloc__(self): - del self.inner_scheduler - -+ cpdef get_should_run(self): -+ """ -+ This function checks whether there are remaining tasks -+ in C scheduler queues. -+ """ -+ cdef InnerScheduler* c_self = self.inner_scheduler -+ return c_self.get_should_run() -+ - cpdef run(self): - cdef InnerScheduler* c_self = self.inner_scheduler - with nogil: -@@ -661,10 +671,6 @@ cdef class PyInnerScheduler: - with nogil: - c_self.task_cleanup_postsync(c_worker, c_task, state) - -- cpdef get_num_active_tasks(self): -- cdef InnerScheduler* c_self = self.inner_scheduler -- return c_self.get_num_active_tasks() -- - cpdef increase_num_active_tasks(self): - cdef InnerScheduler* c_self = self.inner_scheduler - c_self.increase_num_active_tasks() -@@ -720,6 +726,18 @@ cdef class PyInnerScheduler: - cdef InnerScheduler* c_self = self.inner_scheduler - return c_self.get_reserved_memory(global_dev_id) - -+ cpdef get_memory_size_to_evict(self, int global_dev_id): -+ cdef InnerScheduler* c_self = self.inner_scheduler -+ return c_self.get_memory_size_to_evict(global_dev_id) -+ -+ cpdef invoke_all_cparrays_clear(self): -+ cdef InnerScheduler* c_self = self.inner_scheduler -+ c_self.invoke_all_cparrays_clear() -+ -+ cpdef get_all_pyparrays_clear_flag(self): -+ cdef InnerScheduler* c_self = self.inner_scheduler -+ return c_self.get_all_pyparrays_clear_flag() -+ - - class Resources: - -diff --git a/src/python/parla/cython/cyparray.pxd b/src/python/parla/cython/cyparray.pxd -index 73223bc..06e7422 100644 ---- a/src/python/parla/cython/cyparray.pxd -+++ b/src/python/parla/cython/cyparray.pxd -@@ -12,12 +12,12 @@ cdef extern from "include/parray.hpp" namespace "parray": - cdef cppclass InnerPArray: - InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *i, uint32_t) except + - void set_size(uint64_t) -- uint64_t get_num_active_tasks(uint32_t global_dev_id) except + -+ uint64_t get_num_referring_tasks(uint32_t global_dev_id) except + - const uint64_t get_parent_id() except + - - cdef class CyPArray: - # Hold a C++ instance which we're wrapping - cdef InnerPArray* cpp_parray - cdef InnerPArray* get_cpp_parray(self) -- cpdef get_num_active_tasks(self, int global_dev_id) -+ cpdef get_num_referring_tasks(self, int global_dev_id) - cpdef get_parray_parentid(self) -diff --git a/src/python/parla/cython/cyparray.pyx b/src/python/parla/cython/cyparray.pyx -index 19e9fee..ad13d6a 100644 ---- a/src/python/parla/cython/cyparray.pyx -+++ b/src/python/parla/cython/cyparray.pyx -@@ -37,8 +37,8 @@ cdef class CyPArray: - cdef InnerPArray* get_cpp_parray(self): - return self.cpp_parray - -- cpdef get_num_active_tasks(self, int global_dev_id): -- return self.cpp_parray.get_num_active_tasks(global_dev_id) -+ cpdef get_num_referring_tasks(self, int global_dev_id): -+ return self.cpp_parray.get_num_referring_tasks(global_dev_id) - - cpdef get_parray_parentid(self): - return self.cpp_parray.get_parent_id() -diff --git a/src/python/parla/cython/mm.pxd b/src/python/parla/cython/mm.pxd -new file mode 100644 -index 0000000..8a8670a ---- /dev/null -+++ b/src/python/parla/cython/mm.pxd -@@ -0,0 +1,13 @@ -+from parla.cython.device_manager cimport DeviceManager -+ -+cdef extern from "include/memory_manager.hpp" nogil: -+ cdef cppclass LRUGlobalEvictionManager: -+ LRUGlobalEvictionManager(DeviceManager *) -+ unsigned long long size(unsigned int device_id) -+ void *remove_and_return_head_from_zrlist(unsigned int device_id) -+ -+cdef class CyMM: -+ cdef LRUGlobalEvictionManager* _inner_mm -+ cpdef size(self, int dev_id) -+ cpdef remove_and_return_head_from_zrlist(self, int dev_id) -+ cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self) -diff --git a/src/python/parla/cython/mm.pyx b/src/python/parla/cython/mm.pyx -new file mode 100644 -index 0000000..268e7cf ---- /dev/null -+++ b/src/python/parla/cython/mm.pyx -@@ -0,0 +1,62 @@ -+ -+from parla.cython import device_manager -+ -+from parla.cython.core cimport LRUGlobalEvictionManager -+from parla.cython cimport device_manager -+#from parla.cython.core import LRUGlobalEvictionManager -+ -+class PyMM: -+ def __init__(self, dm: device_manager.PyDeviceManager): -+ self._device_manager = device_manager -+ self._cy_mm = CyMM(dm.get_cy_device_manager()) -+ -+ def size(self, dev_id: int): -+ return self._cy_mm.size(dev_id) -+ -+ def remove_and_return_head_from_zrlist(self, dev_id: int): -+ return self._cy_mm.remove_and_return_head_from_zrlist(dev_id) -+ -+ def get_cy_memory_manager(self): -+ return self._cy_mm -+ -+ def print_memory_stats(self, device_id, label: str): -+ import psutil -+ import os -+ print(f"[{label}] Memory tracking", flush=True) -+ try: -+ import cupy -+ mempool = cupy.get_default_memory_pool() -+ pinned_mempool = cupy.get_default_pinned_memory_pool() -+ print(( -+ f"\t GPU{device_id} {label} CuPy used bytes: {mempool.used_bytes()} \n" -+ f"\t GPU{device_id} {label} Free bytes: {mempool.free_bytes()} \n" -+ f"\t GPU{device_id} {label} Total bytes: {mempool.total_bytes()} \n"), flush=True) -+ except ImportError: -+ print("MM tracker only supports CuPy memory status checking.", flush=True) -+ -+ -+cdef class CyMM: -+ -+ def __cinit__(self, device_manager.CyDeviceManager cy_dm): -+ self._inner_mm = new LRUGlobalEvictionManager(cy_dm.get_cpp_device_manager()) -+ -+ def __dealloc__(self): -+ del self._inner_mm -+ -+ cpdef size(self, int dev_id): -+ cdef LRUGlobalEvictionManager* c_self = self._inner_mm -+ return c_self.size(dev_id) -+ -+ cpdef remove_and_return_head_from_zrlist(self, int dev_id): -+ cdef LRUGlobalEvictionManager* c_self = self._inner_mm -+ cdef void* py_parray = c_self.remove_and_return_head_from_zrlist(dev_id) -+ if py_parray == NULL: -+ # TODO(hc): This path is actually not used. -+ # It would be great if we can check if this python object is valid -+ # at here; it can simplify our current mechanism a lot. -+ return None -+ else: -+ return py_parray -+ -+ cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self): -+ return self._inner_mm -diff --git a/src/python/parla/cython/scheduler.pyx b/src/python/parla/cython/scheduler.pyx -index 3390240..db2caae 100644 ---- a/src/python/parla/cython/scheduler.pyx -+++ b/src/python/parla/cython/scheduler.pyx -@@ -21,6 +21,7 @@ from parla.cython import tasks - cimport core - from parla.cython import core - from parla.cython.cyparray import CyPArray -+from parla.cython.mm import PyMM - - from parla.common.globals import _Locals as Locals - from parla.common.globals import USE_PYTHON_RUNAHEAD, _global_data_tasks, PREINIT_THREADS -@@ -231,11 +232,11 @@ class WorkerThread(ControllableThread, SchedulerContext): - #print("Setting environment for task", active_task, flush=True) - active_task.environment = device_context - -- - #Writes all 'default' streams and event pointers to c++ task - #This allows their synchronization without the GIL and faster iteration over them - #(only saves initial runtime ones, TODO(wlr): save any user added events or streams after body returns) - device_context.write_to_task(active_task) -+ - #print("Wrote enviornment to task", active_task, flush=True) - - #handle event wait in python -@@ -294,6 +295,7 @@ class WorkerThread(ControllableThread, SchedulerContext): - elif isinstance(final_state, tasks.TaskRunahead): - core.binlog_2("Worker", "Runahead task: ", active_task.inner_task, " on worker: ", self.inner_worker) - -+ #TODO(wlr): Add better exception handling - #print("Cleaning up Task", active_task, flush=True) - - if USE_PYTHON_RUNAHEAD: -@@ -314,13 +316,13 @@ class WorkerThread(ControllableThread, SchedulerContext): - if isinstance(final_state, tasks.TaskRunahead): - final_state = tasks.TaskCompleted(final_state.return_value) - active_task.cleanup() -- - core.binlog_2("Worker", "Completed task: ", active_task.inner_task, " on worker: ", self.inner_worker) - - # print("Finished Task", active_task, flush=True) - active_task.state = final_state - self.task = None - -+ self.task = None - nvtx.pop_range(domain="Python Runtime") - elif self._should_run: - raise WorkerThreadException("%r Worker: Woke without a task", self.index) -@@ -350,7 +352,7 @@ class WorkerThread(ControllableThread, SchedulerContext): - - class Scheduler(ControllableThread, SchedulerContext): - -- def __init__(self, device_manager, n_threads=6, period=0.001): -+ def __init__(self, memory_manager, device_manager, n_threads=6, period=0.001): - super().__init__() - - self.start_monitor = threading.Condition(threading.Lock()) -@@ -364,9 +366,28 @@ class Scheduler(ControllableThread, SchedulerContext): - #TODO: Handle resources better - resources = 1.0 - -+ self.memory_manager = memory_manager - self.device_manager = device_manager -+ cy_memory_manager = self.memory_manager.get_cy_memory_manager() - cy_device_manager = self.device_manager.get_cy_device_manager() -- self.inner_scheduler = PyInnerScheduler(cy_device_manager, n_threads, resources, self) -+ self.inner_scheduler = PyInnerScheduler(cy_memory_manager, -+ cy_device_manager, -+ n_threads, -+ resources, self) -+ # This holds PArray references. -+ # Through this, it makes a scheduler control a PArray's life cycle. -+ # For example, this holds the last reference of a PArray and so, -+ # scheduler (or memory manager) can have the control of thats -+ # deallocation. -+ # TODO(hc): However, for now, we only support reset this dictionary -+ # not deallocating a single PArray. -+ # For now, instead we deallocate PArray instance by -+ # removing the internal array reference through evict(). -+ # In the future, we will have better design for this. -+ self.active_parrays = {} -+ # Worker threads and a scheduler both can access the active_parrays -+ # and so we need a lock to guard that. -+ self.active_parrays_monitor = threading.Condition(threading.Lock()) - - self.worker_threads = [WorkerThread(self, i) for i in range(n_threads)] - -@@ -382,6 +403,58 @@ class Scheduler(ControllableThread, SchedulerContext): - def scheduler(self): - return self - -+ def append_active_parray(self, parray: PArray): -+ """ Append a PArray reference. -+ -+ :param parray: PArray to be appended -+ """ -+ with self.active_parrays_monitor: -+ self.active_parrays[parray.ID] = parray -+ -+ def remove_active_parray(self, parray: PArray): -+ """ Remove a PArray reference. -+ -+ :param parray: PArray to be removed -+ """ -+ with self.active_parrays_monitor: -+ self.active_parrays[parray.ID] = None -+ -+ def clear_active_parrays(self): -+ """ Clear all references from active_parrays dictionary. -+ """ -+ with self.active_parrays_monitor: -+ import cupy -+ for k in range(0, 4): -+ with cupy.cuda.Device(k): -+ mempool = cupy.get_default_memory_pool() -+ print(f"\t {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True) -+ # TODO(hc): This is unstable hack. -+ # This loop is necessary since a worker thread -+ # can proceed with the next task graph -+ # (in case of benchmark/python/benchmark.py) -+ # and generate and allocate new data while a scheduler -+ # who is the caller of this function holds its reference. -+ # It seems like there is a concurrency issue; for example, -+ # removing active_parrays' reference is not immediately -+ # caught by a gc. The safest way that I am using is to evict -+ # each of PArrays at here. -+ # But still it is unstable way and I will replace this soon. -+ for k, v in self.active_parrays.items(): -+ for dev in self.device_manager.get_all_devices(): -+ global_dev_id = dev.get_global_id() -+ parray_dev_id = self.device_manager.globalid_to_parrayid(global_dev_id) -+ v.evict(parray_dev_id) -+ # TODO(hc): I am not sure why the above loop is necessary.. -+ # But otherwise, it doesnt deallocate cupy arrays. -+ # TODO(hc): This is not considering concurrent execution between -+ # a thread and a scheduler.. -+ # The main thread can allocate new data while or before the scheduler -+ # deallocates the old PArrays through this. -+ for k in range(0, 4): -+ with cupy.cuda.Device(k): -+ mempool = cupy.get_default_memory_pool() -+ print(f"\t After {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True) -+ - def get_device_reqs_from_placement(self, placement, vcus, memory): - return self.device_manager.get_device_reqs_from_placement(placement, vcus, memory) - -@@ -414,10 +487,73 @@ class Scheduler(ControllableThread, SchedulerContext): - pass - #print("Runtime Stopped", flush=True) - -+ def parray_eviction(self): -+ py_mm = self.memory_manager -+ print("Eviction policy is activated") -+ for cuda_device in self.device_manager.get_devices(DeviceType.CUDA): -+ global_id = cuda_device.get_global_id() -+ parray_id = self.device_manager.globalid_to_parrayid(global_id) -+ # Get target memory size to evict from this device -+ memory_size_to_evict = \ -+ self.inner_scheduler.get_memory_size_to_evict(global_id) -+ # Get the number of PArray candidates that are allowed to be evicted -+ # from Python eviction manager. -+ num_evictable_parray = py_mm.size(global_id) -+ # TODO(hc): remove this. this is for test. -+ import cupy -+ for i in range(0, num_evictable_parray): -+ try: -+ # Get a PArray from a memory manager to evict. -+ evictable_parray = \ -+ py_mm.remove_and_return_head_from_zrlist(global_id) -+ if evictable_parray is not None: -+ # TODO(hc): remove this. this is for test. -+ for k in range(0, 4): -+ with cupy.cuda.Device(k): -+ mempool = cupy.get_default_memory_pool() -+ print(f"\t OK? {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) -+ -+ evictable_parray.evict(parray_id) -+ -+ # TODO(hc): remove this. this is for test. -+ for k in range(0, 4): -+ with cupy.cuda.Device(k): -+ mempool = cupy.get_default_memory_pool() -+ print(f"\t OK {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) -+ -+ # Repeat eviction until it gets enough memory. -+ memory_size_to_evict -= \ -+ evictable_parray.nbytes_at(parray_id) -+ if memory_size_to_evict <= 0: -+ break -+ except Exception as e: -+ print("Failed to find parray evictable", flush=True) -+ return -+ - def run(self): -- #print("Scheduler: Running", flush=True) -- self.inner_scheduler.run() -- #print("Scheduler: Stopped Loop", flush=True) -+ with self: -+ while True: -+ print("Scheduler: Running", flush=True) -+ self.inner_scheduler.run() -+ if self.inner_scheduler.get_all_pyparrays_clear_flag(): -+ # All the references of the PArrays held by -+ # a Python scheduler should be destroyed -+ # AFTER C++ scheduler (or memory manager) clears -+ # all PArray nodes from the list in the eviction manager. -+ # TODO(hc): rename the function to is_cparrays_cleared. -+ # TODO(hc): this might be done by a worker thread -+ # who will allocate PArrays. -+ # Otherwise, there should be thread concurrent -+ # over memory allocation/deallocation. -+ self.clear_active_parrays() -+ else: -+ should_run = self.inner_scheduler.get_should_run() -+ if should_run == False: -+ break -+ # This case is executed if PArray eviction -+ # mechanism was invoked by C++ scheduler. -+ self.parray_eviction() -+ self.stop_callback() - - def stop(self): - #print("Scheduler: Stopping (Called from Python)", flush=True) -@@ -437,7 +573,6 @@ class Scheduler(ControllableThread, SchedulerContext): - def spawn_task(self, task): - #print("Scheduler: Spawning Task", task, flush=True) - self.inner_scheduler.spawn_task(task.inner_task) -- - - def assign_task(self, task, worker): - task.state = tasks.TaskRunning(task.func, task.args, task.dependencies) -@@ -449,7 +584,6 @@ class Scheduler(ControllableThread, SchedulerContext): - def spawn_wait(self): - self.inner_scheduler.spawn_wait() - -- - def create_parray(self, cy_parray: CyPArray, parray_dev_id: int): - """ - Reserve PArray instances that are created through -@@ -514,6 +648,20 @@ class Scheduler(ControllableThread, SchedulerContext): - return self.inner_scheduler.get_reserved_parray_state( \ - global_dev_id, parray_parent_id) - -+ def remove_parray_from_tracker(\ -+ self, cy_parray: CyPArray, did: int): -+ """ -+ Remove the evicted PArray instance on device `global_dev_id` -+ from the PArray tracker's table -+ -+ :param cy_parray: Cython PArray instance to be removed -+ :param did: global logical device id where the PArray is evicted -+ """ -+ self.inner_scheduler.remove_parray_from_tracker(cy_parray, did) -+ -+ def invoke_all_cparrays_clear(self): -+ self.inner_scheduler.invoke_all_cparrays_clear() -+ - - def _task_callback(task, body): - """ -diff --git a/src/python/parla/cython/tasks.pyx b/src/python/parla/cython/tasks.pyx -index 98bd759..bd71e44 100644 ---- a/src/python/parla/cython/tasks.pyx -+++ b/src/python/parla/cython/tasks.pyx -@@ -21,6 +21,8 @@ from parla.common.globals import AccessMode, Storage - from parla.cython.cyparray import CyPArray - from parla.common.parray.core import PArray - from parla.common.globals import SynchronizationType as SyncType -+from parla.common.globals import _global_data_tasks -+ - - PyDevice = device.PyDevice - PyCUDADevice = device.PyCUDADevice -@@ -554,18 +556,25 @@ class Task: - cy_parray = in_parray.cy_parray - self.inner_task.add_parray(cy_parray, - AccessMode.IN, in_parray_devid) -+ # Add a PArray reference to a dictionary in a scheduler -+ # to pass its lifecycle. -+ self.scheduler.append_active_parray(in_parray) - for out_parray_tpl in dataflow.output: - out_parray = out_parray_tpl[0] - out_parray_devid = out_parray_tpl[1] - cy_parray = out_parray.cy_parray - self.inner_task.add_parray(cy_parray, - AccessMode.OUT, out_parray_devid) -+ self.scheduler.append_active_parray(out_parray) - for inout_parray_tpl in dataflow.inout: - inout_parray = inout_parray_tpl[0] - inout_parray_devid = inout_parray_tpl[1] - cy_parray = inout_parray.cy_parray - self.inner_task.add_parray(cy_parray, - AccessMode.INOUT, inout_parray_devid) -+ # TODO(hc): Maybe we can pass dataflow to reduce -+ # lock conflicts. -+ self.scheduler.append_active_parray(inout_parray) - - def notify_dependents_wrapper(self): - """! -@@ -713,6 +722,7 @@ class DataMovementTask(Task): - idx=0, state=TaskCreated(), scheduler=None, name=None): - super().__init__(taskspace, idx, state, scheduler, name) - self.parray = parray -+ - self.access_mode = access_mode - self.assigned_devices = assigned_devices - -@@ -748,7 +758,6 @@ class DataMovementTask(Task): - self.parray._auto_move(device_manager.get_parray_id(global_device_id), - write_flag) - """ --#self.parray._auto_move(device_manager.get_parray_id(self.dev_id), write_flag) - target_dev = self.assigned_devices[0] - global_id = target_dev.get_global_id() - parray_id = device_manager.globalid_to_parrayid(global_id) -@@ -758,7 +767,8 @@ class DataMovementTask(Task): - return TaskRunahead(0) - - def cleanup(self): -- pass -+ _global_data_tasks[id(self)] = None -+ self.parray = None - - ###### - # Task Environment -@@ -1828,9 +1838,3 @@ class BackendTaskSpace(TaskSpace): - - def wait(self): - self.inner_space.wait() -- -- -- -- -- -- -diff --git a/src/python/parla/utility/execute.py b/src/python/parla/utility/execute.py -index 36825aa..eadcaaf 100644 ---- a/src/python/parla/utility/execute.py -+++ b/src/python/parla/utility/execute.py -@@ -261,15 +261,16 @@ def synthetic_kernel_gpu(total_time: int, gil_fraction: Union[Fraction, float], - return None - - --def create_task_no_data(task, taskspaces, config, data_list=None): -+def create_task_no_data(task, taskspaces, config, ts_postfix=None, data_list=None): - - try: - # Task ID - task_idx = task.task_id.task_idx -- taskspace = taskspaces[task.task_id.taskspace] -+ ts_postfix_str = "" if ts_postfix is None else ts_postfix -+ taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str] - - # Dependency Info -- dependencies = [taskspaces[dep.taskspace][dep.task_idx] -+ dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx] - for dep in task.task_dependencies] - - # Valid Placement Set -@@ -326,15 +327,16 @@ def create_task_no_data(task, taskspaces, config, data_list=None): - return - - --def create_task_eager_data(task, taskspaces, config=None, data_list=None): -+def create_task_eager_data(task, taskspaces, config=None, ts_postfix=None, data_list=None): - - try: - # Task ID - task_idx = task.task_id.task_idx -- taskspace = taskspaces[task.task_id.taskspace] -+ ts_postfix_str = "" if ts_postfix is None else ts_postfix -+ taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str] - - # Dependency Info -- dependencies = [taskspaces[dep.taskspace][dep.task_idx] -+ dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx] - for dep in task.task_dependencies] - - # Valid Placement Set -@@ -397,14 +399,15 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None): - if config.gil_fraction is not None: - gil_fraction = config.gil_fraction - -- # print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True) - """ -+ print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True) - print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction, - " placement:", placement_set) -- """ -- - # TODO(hc): Add data checking. -- @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT) -+ print("Memory:", IN[0][0].nbytes) -+ """ -+ # TODO(hc): remove memory operand -+ @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT, memory=IN[0][0].nbytes) - async def task_func(): - if config.verbose: - print(f"+{task.task_id} Running", flush=True) -@@ -421,15 +424,16 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None): - return - - --def create_task_lazy_data(task, taskspaces, config=None, data_list=None): -+def create_task_lazy_data(task, taskspaces, config=None, ts_postfix=None, data_list=None): - - try: - # Task ID - task_idx = task.task_id.task_idx -- taskspace = taskspaces[task.task_id.taskspace] -+ ts_postfix_str = "" if ts_postfix is None else ts_postfix -+ taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str] - - # Dependency Info -- dependencies = [taskspaces[dep.taskspace][dep.task_idx] -+ dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx] - for dep in task.task_dependencies] - - # Valid Placement Set -@@ -483,8 +487,11 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None): - - if config.gil_fraction is not None: - gil_fraction = config.gil_fraction -+ """ - print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction, - " placement:", placement_set) -+ -+ """ - - @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set]) - async def task_func(): -@@ -520,7 +527,7 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None): - return - - --def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, data_list=None): -+def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, ts_postfix=None, data_list=None): - - spawn_start_t = time.perf_counter() - -@@ -530,32 +537,36 @@ def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConf - if run_config.movement_type == MovementType.NO_MOVEMENT: - # print("No data movement") - create_task_no_data(details, taskspaces, -- config=run_config, data_list=data_list) -+ config=run_config, -+ ts_postfix=ts_postfix, -+ data_list=data_list) - elif run_config.movement_type == MovementType.EAGER_MOVEMENT: - # print("Eager data movement") - create_task_eager_data(details, taskspaces, -- config=run_config, data_list=data_list) -+ config=run_config, -+ ts_postfix=ts_postfix, -+ data_list=data_list) - elif run_config.movement_type == MovementType.LAZY_MOVEMENT: - # print("Lazy data movement") - create_task_lazy_data(details, taskspaces, -- config=run_config, data_list=data_list) -- -+ config=run_config, -+ ts_postfix=ts_postfix, -+ data_list=data_list) - spawn_end_t = time.perf_counter() - - return taskspaces - - - def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]): -- - @spawn(vcus=0) - async def main_task(): - - graph_times = [] -+ # Generate data once for multiple iterations. -+ data_list = generate_data( -+ data_config, run_config.data_scale, run_config.movement_type) - - for i in range(run_config.inner_iterations): -- data_list = generate_data( -- data_config, run_config.data_scale, run_config.movement_type) -- - # Initialize task spaces - taskspaces = {} - -@@ -570,10 +581,76 @@ def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo - - for taskspace in taskspaces.values(): - await taskspace -+ graph_end_t = time.perf_counter() -+ -+ #worker_thread = get_scheduler_context() -+ #worker_thread.scheduler.invoke_all_cparrays_clear() -+ -+ graph_elapsed = graph_end_t - graph_start_t -+ graph_times.append(graph_elapsed) -+ -+ graph_times = np.asarray(graph_times) -+ graph_t = TimeSample(np.mean(graph_times), np.median(graph_times), np.std( -+ graph_times), np.min(graph_times), np.max(graph_times), len(graph_times)) -+ -+ timing.append(graph_t) -+ -+ -+def execute_graph_memory2(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]): -+ """ -+ This function creates data for each iteration and intentionally decreases -+ its reference count. -+ Data generated in the previous iteration becomes unnecessary and a -+ scheduler's eviction manager evicts those data to CPU. -+ This function requires the following consideration; -+ data should be less than CPU memory. So, the number of iterations or -+ PArray size should not be high. -+ TODO(hc): it might be a separate test. -+ """ -+ @spawn(vcus=0) -+ async def main_task(): -+ -+ graph_times = [] -+ data_list = [] -+ max_memory = 0 -+ for i in range(0, run_config.inner_iterations): -+ import cupy -+ # Initialize task spaces -+ taskspaces = {} -+ -+ data_list.append(generate_data(data_config, run_config.data_scale, run_config.movement_type)) -+ ts_postfix = "-"+str(i) -+ # Create a task space with the postfix of the current iteration. -+ for task, details in tasks.items(): -+ space_name = details.task_id.taskspace -+ space_name += ts_postfix -+ if space_name not in taskspaces: -+ taskspaces[space_name] = TaskSpace(space_name) - -+ for k in range(0, 4): -+ with cupy.cuda.Device(k): -+ mempool = cupy.get_default_memory_pool() -+ print(f"\t Right Before {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") -+ -+ graph_start_t = time.perf_counter() -+ -+ execute_tasks(taskspaces, tasks, run_config, ts_postfix, data_list=data_list[i]) -+ -+ for taskspace in taskspaces.values(): -+ await taskspace -+ taskspace = None - graph_end_t = time.perf_counter() - -+ #worker_thread = get_scheduler_context() -+ #worker_thread.scheduler.invoke_all_cparrays_clear() -+ -+ for k in range(0, 4): -+ with cupy.cuda.Device(k): -+ mempool = cupy.get_default_memory_pool() -+ print(f"\t Right After {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") -+ - graph_elapsed = graph_end_t - graph_start_t -+ print("Iteration:", i, ", execution time:", graph_elapsed, flush=True) - graph_times.append(graph_elapsed) - - graph_times = np.asarray(graph_times) -@@ -597,7 +674,7 @@ def run(tasks: Dict[TaskID, TaskInfo], data_config: Dict[int, DataInfo] = None, - - with Parla(logfile=run_config.logfile): - internal_start_t = time.perf_counter() -- execute_graph(data_config, tasks, run_config, timing) -+ execute_graph_memory2(data_config, tasks, run_config, timing) - internal_end_t = time.perf_counter() - - outer_end_t = time.perf_counter() -@@ -797,12 +874,14 @@ class GraphContext(object): - return self - - def run(self, run_config: RunConfig, max_time: int = 100): -- -+ return run(self.graph, self.data_config, run_config) -+ """ - @timeout(max_time) - def run_with_timeout(): - return run(self.graph, self.data_config, run_config) - - return run_with_timeout() -+ """ - - def __exit__(self, type, value, traceback): - self.diro.__exit__(type, value, traceback) From b2c84851003da2ca9b1b93e61bb3005c6f1a1f9f Mon Sep 17 00:00:00 2001 From: wlruys Date: Thu, 18 Jan 2024 03:46:59 +0000 Subject: [PATCH 5/5] style: apply ruff format --- src/python/parla/common/globals.py | 1 + src/python/parla/common/parray/core.py | 2 +- src/python/parla/common/parray/from_data.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/python/parla/common/globals.py b/src/python/parla/common/globals.py index 75d9fe41..afc359b0 100644 --- a/src/python/parla/common/globals.py +++ b/src/python/parla/common/globals.py @@ -72,6 +72,7 @@ def print_config(): print("Default Runahead Behavior: ", default_sync, flush=True) print("VCU Precision: ", VCU_BASELINE, flush=True) + class DeviceType(IntEnum): """ This class declares device types. diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py index ec251551..3833b7f5 100644 --- a/src/python/parla/common/parray/core.py +++ b/src/python/parla/common/parray/core.py @@ -217,7 +217,7 @@ def _current_device_index(self) -> int: def set_name(self, name: str): self._name = name - def get(self, device: Optional[PyDevice] = None) -> 'np.ndarray' | 'cp.ndarray': + def get(self, device: Optional[PyDevice] = None) -> "np.ndarray" | "cp.ndarray": if device is None: return self.array else: diff --git a/src/python/parla/common/parray/from_data.py b/src/python/parla/common/parray/from_data.py index 9f72bc0e..930cbb16 100644 --- a/src/python/parla/common/parray/from_data.py +++ b/src/python/parla/common/parray/from_data.py @@ -176,12 +176,12 @@ def get_parray(object, count=0): # recursively process Sequence or Dictionary elif isinstance(object, dict): accumulator = {} for key, value in object.items(): - accumulator[key] = get_parray(value, count+1) + accumulator[key] = get_parray(value, count + 1) return accumulator elif isinstance(object, (list, tuple, set)): accumulator = [] for item in object: - accumulator.append(get_parray(item, count+1)) + accumulator.append(get_parray(item, count + 1)) return type(object)(accumulator) else: raise TypeError(f"Unsupported Type: {type(object)}")