From e17395a2ba7e04fadf78d400f2003dee3e37d899 Mon Sep 17 00:00:00 2001
From: Hochan Lee <hochan@utexas.edu>
Date: Sun, 14 Jan 2024 16:05:10 -0600
Subject: [PATCH 1/5] Fix data dependency

---
 src/c/backend/include/phases.hpp  |    1 +
 src/c/backend/include/runtime.hpp |    6 +
 src/c/backend/phases.cpp          |   85 +-
 src/dev.patch                     | 2498 +++++++++++++++++++++++++++++
 src/python/parla/cython/tasks.pyx |    7 +-
 5 files changed, 2590 insertions(+), 7 deletions(-)
 create mode 100644 src/dev.patch
diff --git a/src/c/backend/include/phases.hpp b/src/c/backend/include/phases.hpp
index 8b3e4a6a..2871c937 100644
--- a/src/c/backend/include/phases.hpp
+++ b/src/c/backend/include/phases.hpp
@@ -277,6 +277,7 @@ class MemoryReserver : virtual public SchedulerPhase {
   * @param task The task to create data movement tasks for.
   */
   void create_datamove_tasks(InnerTask *task);
+  void create_datamove_tasks2(InnerTask *task);
 };
 
 /**
diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp
index de7da5da..88e06018 100644
--- a/src/c/backend/include/runtime.hpp
+++ b/src/c/backend/include/runtime.hpp
@@ -293,6 +293,8 @@ class InnerTask {
   std::vector<std::vector<std::pair<parray::InnerPArray *, AccessMode>>>
       parray_list;
 
+  std::unordered_map<uint64_t, std::vector<InnerTask*>> parray_dependencies_map;
+
   InnerTask();
   InnerTask(long long int id, void *py_task);
   InnerTask(std::string name, long long int id, void *py_task);
@@ -623,6 +625,10 @@ class InnerTask {
   void begin_multidev_req_addition();
   void end_multidev_req_addition();
 
+  std::vector<InnerTask*>& get_parray_dependencies(uint64_t parray_parent_id) {
+    return this->parray_dependencies_map[parray_parent_id];
+  }
+
   PlacementRequirementCollections &get_placement_req_options() {
     return placement_req_options_;
   }
diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp
index 0bcb091a..2713e21f 100644
--- a/src/c/backend/phases.cpp
+++ b/src/c/backend/phases.cpp
@@ -237,6 +237,89 @@ void MemoryReserver::create_datamove_tasks(InnerTask *task) {
   task->add_dependencies(data_tasks, true);
 }
 
+
+
+
+void MemoryReserver::create_datamove_tasks2(InnerTask *task) {
+  // Get a list of the parrays the current task holds.
+  const std::vector<std::vector<std::pair<parray::InnerPArray *, AccessMode>>>
+      &parray_list = task->parray_list;
+  std::string task_base_name = task->get_name();
+  std::vector<InnerTask *> data_tasks;
+  data_tasks.reserve(parray_list.size());
+
+  for (size_t i = 0; i < parray_list.size(); ++i) {
+    for (size_t j = 0; j < parray_list[i].size(); ++j) {
+      // Create a data movement task for each PArray.
+      parray::InnerPArray *parray = parray_list[i][j].first;
+      AccessMode access_mode = parray_list[i][j].second;
+      InnerDataTask *datamove_task = new InnerDataTask(
+          // TODO(hc): id should be updated!
+          task_base_name + ".dm." + std::to_string(i), 0, parray, access_mode,
+          i);
+      uint64_t parray_parent_id = parray->get_parent_parray()->id;
+      // Find dependency intersection between compute and data movement tasks.
+
+      // TODO(hc): This is not the complete implementation.
+      //           We will use a concurrent map for parray's
+      //           task list as an optimization.
+
+      std::vector<void *> compute_task_dependencies = task->get_dependencies();
+      std::vector<InnerTask *> data_task_dependencies;
+      for (size_t k = 0; k < compute_task_dependencies.size(); ++k) {
+        InnerTask *parray_dependency =
+            static_cast<InnerTask *>(compute_task_dependencies[k]);
+        std::vector<InnerTask*>& dep_parray_dependencies = 
+            parray_dependency->get_parray_dependencies(parray_parent_id);
+
+        std::cout << parray_dependency->name << " is being traversed\n";
+        for (size_t t = 0; t < dep_parray_dependencies.size(); ++t) {
+          data_task_dependencies.push_back(parray_dependency);
+          // If the current processing parray's access mode is READ ONLY,
+          // add this dependency as a dependency for this parray.
+          std::cout << "access mode:" << int(access_mode) << "\n";
+          if (access_mode == AccessMode::IN) {
+            std::cout << "IN parray is added:" << parray_parent_id << "\n";
+            task->get_parray_dependencies(parray_parent_id).push_back(parray_dependency);
+          }
+        }
+      }
+
+      // If the current processing parray's access mode is not READ ONLY,
+      // add itself as a dependency for this parray.
+      std::cout << task->name << " is being traversed access id :" << int(access_mode) << "\n";
+      if (access_mode != AccessMode::IN) {
+        std::cout << "IN/OUT OUT parray is added:" << parray_parent_id << "\n";
+        task->get_parray_dependencies(parray_parent_id).push_back(task);
+      }
+
+      // TODO(hc): pass false to add_dependencies() as optimization.
+      datamove_task->add_dependencies(data_task_dependencies, true);
+      // Copy assigned devices to a compute task to a data movement task.
+      // TODO(hc): When we support xpy, it should be devices corresponding
+      //           to placements of the local partition.
+      auto device = task->get_assigned_devices()[i];
+      datamove_task->add_assigned_device(device);
+
+      datamove_task->device_constraints.emplace(
+          std::piecewise_construct,
+          std::forward_as_tuple(device->get_global_id()),
+          std::forward_as_tuple(0, 0, 1));
+
+      data_tasks.push_back(datamove_task);
+      // Add the created data movement task to a reserved task queue.
+      this->scheduler->increase_num_active_tasks();
+      this->reserved_tasks_buffer.push_back(datamove_task);
+    }
+  }
+
+  // Create dependencies between data move task and compute tasks.
+  task->add_dependencies(data_tasks, true);
+}
+
+
+
+
 void MemoryReserver::run(SchedulerPhase *next_phase) {
   NVTX_RANGE("MemoryReserver::run", NVTX_COLOR_LIGHT_GREEN)
 
@@ -263,7 +346,7 @@ void MemoryReserver::run(SchedulerPhase *next_phase) {
     if (can_reserve) {
       this->reserve_resources(task);
       this->reservable_tasks->pop();
-      this->create_datamove_tasks(task);
+      this->create_datamove_tasks2(task);
       this->reserved_tasks_buffer.push_back(task);
     } else {
       // TODO:(wlr) we need some break condition to allow the scheduler to
diff --git a/src/dev.patch b/src/dev.patch
new file mode 100644
index 00000000..1789036a
--- /dev/null
+++ b/src/dev.patch
@@ -0,0 +1,2498 @@
+diff --git a/src/c/backend/include/memory_manager.hpp b/src/c/backend/include/memory_manager.hpp
+new file mode 100644
+index 0000000..b1467ec
+--- /dev/null
++++ b/src/c/backend/include/memory_manager.hpp
+@@ -0,0 +1,420 @@
++#ifndef PARLA_MEMORY_MANNGER_HPP
++#define PARLA_MEMORY_MANNGER_HPP
++
++#include "device_manager.hpp"
++#include "parray.hpp"
++
++
++/**
++ * @brief Node type of a PArray eviction double-linked list.
++ */
++class PArrayNode {
++public:
++  PArrayNode(parray::InnerPArray *parr, size_t prior = 0) :
++      parray(parr), priority(prior), next(nullptr), prev(nullptr)
++  {}
++
++  /// Pointer of a PArray instance
++  parray::InnerPArray *parray;
++  /// Priority of the node
++  /// TODO(hc): This is not used, but keep it for the future
++  size_t priority;
++  /// Pointers to the next and the previous PArrayNodes
++  PArrayNode *next;
++  PArrayNode *prev;
++};
++
++/**
++ * @brief Double-linked list of candidate PArrays for eviction.
++ * @details PArray eviction manager selects and evicts PArray instances
++ * in this list depending on an eviction policy.
++ * Note that an eviction manager manages this list for each device.
++ */
++class DoubleLinkedList {
++public:
++
++  /**
++   * @brief Print the current list.
++   */
++  void print() {
++    PArrayNode *node = this->head_;
++    std::cout << "\n";
++    while (node != nullptr) {
++      std::cout << node->parray->id << " -> \n"; 
++      node = node->next;
++    }
++    std::cout << "\n";
++  }
++
++  /**
++   * @brief Append a PArray node to the list.
++   * @detail The first PArray of the list is set to both head and tail, and
++   * the last added PArray is set to tail.
++   *
++   * @param node PArray node to be appended
++   */
++  void append(PArrayNode *node) {
++    this->mtx_.lock();
++    if (this->list_size_ == 0) {
++      this->head_ = node;
++      this->tail_ = node;
++    } else {
++      this->tail_->next = node;
++      node->prev = this->tail_;
++      this->tail_ = node;
++    }
++    this->list_size_ += 1;
++    this->mtx_.unlock();
++  }
++
++  /**
++   * @brief Insert a PArray node between `node` and `node->next`.
++   *
++   * @param node existing PArray node where `new_node` is being linked
++   * @param new_node PArray node to be appended after `node`
++   */
++  void insert_after(PArrayNode *node, PArrayNode *new_node) {
++    this->mtx_.lock();
++    if (node->next != nullptr) {
++      node->next->prev = new_node;
++      new_node->next = node->next;
++    } else {
++      this->tail_ = new_node;
++    }
++    node->next = new_node;
++    new_node->prev = node;
++    this->mtx_.unlock();
++  }
++
++  /**
++   * @brief Insert a PArray node between `node` and `node->prev`.
++   *
++   * @param node existing PArray node where `new_node` is being linked
++   * @param new_node PArray node to be appended before `node`
++   */
++  void insert_before(PArrayNode *node, PArrayNode *new_node) {
++    this->mtx_.lock();
++    if (node->prev != nullptr) {
++      node->prev->next = new_node;
++      new_node->prev = node->prev;
++    } else {
++      this->head_ = new_node;
++    }
++    node->prev = new_node;
++    new_node->next = node;
++    this->mtx_.unlock();
++  }
++
++  /**
++   * @brief Remove and return the current head PArray node from a list.
++   */
++  PArrayNode *remove_head() {
++    this->mtx_.lock();
++    PArrayNode *old_head = this->head_;
++    if (old_head != nullptr) {
++      this->remove_unsafe(old_head); 
++    }
++    this->mtx_.unlock();
++    return old_head;
++  }
++
++  /**
++   * @brief Remove a node and return true if it is removed false otherwise.
++   *
++   * @param node PArray node to be removed from a list
++   */
++  bool remove(PArrayNode *node) {
++    this->mtx_.lock();
++    bool rv = this->remove_unsafe(node);
++    this->mtx_.unlock();
++    return rv;
++  }
++
++  /**
++   * @brief Remove a node and return true if it is removed false otherwise.
++   * This function is not thread safe.
++   *
++   * @param node PArray node to be removed from a list
++   */
++  bool remove_unsafe(PArrayNode *node) {
++    if (node->prev == nullptr && node->next == nullptr &&
++        node != this->head_ && node != this->tail_) {
++      // If a node is not in a list, do nothing and return false.
++      return false;
++    }
++
++    if (this->list_size_ == 1) {
++      // A node is a single node in a list.
++      this->head_ = this->tail_ = nullptr;
++    } else {
++      if (this->head_ == node) {
++        // A node is a head, and so break link of node->next->prev.
++        this->head_ = node->next;
++        node->next->prev = nullptr;
++      } else if (this->tail_ == node) {
++        // A node is a tail, and so break link of node->prev->next.
++        this->tail_ = node->prev;
++        node->prev->next = nullptr;
++      } else {
++        // A node is in the middle of a list, and so break two links.
++        node->prev->next = node->next;
++        node->next->prev = node->prev;
++      }
++    }
++    node->prev = node->next = nullptr;
++    this->list_size_ -= 1;
++    return true;
++  }
++
++  /**
++   * @brief Return a size of a list.
++   */
++  size_t size() {
++    this->mtx_.lock();
++    size_t list_size = this->list_size_;
++    this->mtx_.unlock();
++    return list_size;
++  }
++
++  /**
++   * @brief Return the current head.
++   * This function is not thread safe.
++   */
++  PArrayNode *get_head() {
++    return this->head_;
++  }
++
++  /**
++   * @brief Return the current tail.
++   * This function is not thread safe.
++   */
++  PArrayNode *get_tail() {
++    return this->tail_;
++  }
++
++private:
++  PArrayNode *head_{nullptr};
++  PArrayNode *tail_{nullptr};
++  std::mutex mtx_;
++  size_t list_size_{0};
++};
++
++
++/**
++ * @brief Least-recently-used policy based eviction manager for a device.
++ * @details It holds PArrays which are not referenced by tasks which are
++ * between task mapping and termination phases.
++ */
++class LRUDeviceEvictionManager {
++public:
++  struct PArrayMetaInfo {
++    // Points to a PArray node if it exists
++    PArrayNode *parray_node_ptr;
++    // The number of references to a PArray
++    size_t ref_count;
++  };
++
++  LRUDeviceEvictionManager(DevID_t dev_id) : dev_id_(dev_id) {}
++
++  /**
++   * @brief A task refers `parray` in the device.
++   * @detail This function is called when a task being mapped
++   * refers `parray`. This increases a reference count of the PArray
++   * and removes it from a zero-referenced list if it exists.
++   *
++   * @param parray pointer to a parray to be referred by a task
++   */
++  void grab_parray_reference(parray::InnerPArray *parray) {
++    this->mtx_.lock();
++    uint64_t parray_id = parray->id;
++    auto found = this->parray_reference_counts_.find(parray_id);
++    if (found == this->parray_reference_counts_.end()) {
++      // Add `parray` to a zr list if it does not exist.
++      PArrayNode *parray_node = new PArrayNode(parray);
++      this->parray_reference_counts_[parray_id] =
++          PArrayMetaInfo{parray_node, 1};
++    } else {
++      // If `parray` is already in a zr list, removes it
++      // from the list and increases its reference count.
++      found->second.ref_count++; 
++      this->zr_parray_list_.remove(found->second.parray_node_ptr);
++    }
++    this->mtx_.unlock();
++  }
++
++  /**
++   * @brief A task is finished and releases `parray` in the device.
++   * @detail This function is called by a worker thread when a task
++   * assigned to that thread is completed. The thread releases the
++   * `parray` instance, and decreases its reference count in the device.
++   * If the reference count becomes 0, the `parray` is added to
++   * the zero-referenced list.
++   *
++   * @param parray pointer to a parray to be released by a task
++   */
++  void release_parray_reference(parray::InnerPArray *parray) {
++    this->mtx_.lock();
++    uint64_t parray_id = parray->id;
++    auto found = this->parray_reference_counts_.find(parray_id);
++    if (found != this->parray_reference_counts_.end()) {
++      found->second.ref_count--; 
++      if (found->second.ref_count == 0) {
++        // If none of task referes to `parray`, add it to
++        // a zr list.
++        this->zr_parray_list_.append(found->second.parray_node_ptr);
++      }
++    }
++    this->mtx_.unlock();
++  }
++
++
++  /**
++   * @brief Return a size of a list.
++   */
++  size_t size() {
++    size_t zr_parray_list_size{0};
++    this->mtx_.lock();
++    zr_parray_list_size = zr_parray_list_.size();
++    this->mtx_.unlock();
++    return zr_parray_list_size;
++  }
++
++  /**
++   * @brief Remove and return a head of the zero-referenced list.
++   * @detail This function is not thread safe since it assumes that only
++   * the scheduler thread calls into this function during eviction. 
++   */
++  PArrayNode *remove_and_return_head_from_zrlist() {
++    PArrayNode* old_head{nullptr};
++    this->mtx_.lock();
++    old_head = this->zr_parray_list_.remove_head();
++    this->mtx_.unlock();
++    return old_head;
++  }
++
++  /**
++   * @brief This function clears all existing PArrays in the
++   * zero-referenced list.
++   * @detail This function has two purposes.
++   * First, it is used to fix unlinked Python and C++ PArray
++   * instances. It is possible that Python PArrays are destroyed
++   * due to, for example, out-of-scope. Then, C++ PArrays
++   * start to hold invalid Python PArray pointers.
++   * When a scheduler starts PArray eviction, it is possible that
++   * the C++ PArrays holding invalid Python PArrays are chosen 
++   * as evictable PArrays and causes segmentation fault.
++   * This function removes those PArrays in advance to avoid
++   * this issue (But users should be aware of and take care of this scenario).
++   * The second purpose is to allow users to clear all memory
++   * related states managed by the Parla runtime.
++   */
++  // TODO(hc): This bulk flushing is not ideal IMO. The Parla runtime
++  //           should provide a function that flushes only a single PArray.
++  //           I am postponing this work since we need to take care of
++  //           the zero-referenced list, but I have higher priorities.
++  void clear_all_instances() {
++    this->mtx_.lock();
++    PArrayNode* head{nullptr};
++    do {
++      head = this->zr_parray_list_.remove_head(); 
++    } while (head != nullptr);
++    this->mtx_.unlock();
++  }
++
++private:
++  /// This eviction manager manages PArray instances in this device
++  DevID_t dev_id_;
++  std::mutex mtx_;
++  /// Key: PArray ID, Value: Meta information including reference
++  /// count of a PArray
++  std::unordered_map<uint64_t, PArrayMetaInfo> parray_reference_counts_;
++  /// A list of zero-referenced PArrays.
++  DoubleLinkedList zr_parray_list_;
++};
++
++
++/**
++ * @brief Least-recently-used policy based global eviction manager.
++ * @details External components access and manipulate PArray instances in any
++ * device through this manager.
++ */
++class LRUGlobalEvictionManager {
++public:
++  LRUGlobalEvictionManager(DeviceManager *device_manager) :
++    device_manager_(device_manager) {
++    this->device_mm_.resize(
++        device_manager->template get_num_devices<DeviceType::All>());
++    for (size_t i = 0; i < this->device_mm_.size(); ++i) {
++      this->device_mm_[i] = new LRUDeviceEvictionManager(i);
++    }
++  }
++
++  /**
++   * @brief A task refers `parray` in `dev_id` device.
++   * @detail This function is called when a task being mapped
++   * refers `parray`. This increases a reference count of the PArray
++   * and removes it from a zero-referenced list if it exists.
++   *
++   * @param parray pointer to a parray to be referred by a task
++   * @param dev_id device id of a device to access its information
++   */
++  void grab_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) {
++    this->device_mm_[dev_id]->grab_parray_reference(parray);
++  }
++
++  /**
++   * @brief A task is finished and releases `parray` in `dev_id` device.
++   * @detail This function is called by a worker thread when a task
++   * assigned to that completes. So, the thread also releases a
++   * `parray`, and decreases a reference count of that in the device.
++   * If the reference count becomes 0, the `parray` is added to
++   * its zero-referenced list.
++   *
++   * @param parray pointer to a parray to be released by a task
++   * @param dev_id device id of a device to access its information
++   */
++  void release_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) {
++    this->device_mm_[dev_id]->release_parray_reference(parray);
++  }
++
++  /**
++   * @brief Return a size of a list.
++   *
++   * @param dev_id device id of a device to access its information
++   */
++  size_t size(DevID_t dev_id) {
++    return this->device_mm_[dev_id]->size();
++  }
++
++  /**
++   * @brief Remove and return a head of the zero-referenced list.
++   * @detail This function is not thread safe since it assumes that only
++   * the scheduler thread calls into this function during eviction. 
++   *
++   * @param dev_id device id of a device to access its information
++   */
++  void *remove_and_return_head_from_zrlist(DevID_t dev_id) {
++    PArrayNode *old_head =
++        this->device_mm_[dev_id]->remove_and_return_head_from_zrlist();
++    void *py_parray{nullptr};
++    if (old_head != nullptr) {
++      parray::InnerPArray *c_parray = old_head->parray;
++      py_parray = c_parray->get_py_parray();
++    }
++    return py_parray;
++  }
++
++  void clear_all_instances() {
++    for (size_t i = 0; i < device_mm_.size(); ++i) {
++      device_mm_[i]->clear_all_instances();
++    }
++  }
++
++private:
++  /// Device manager managing system environment
++  DeviceManager *device_manager_;
++  /// A list of LRU-based eviction manager for each device
++  std::vector<LRUDeviceEvictionManager *> device_mm_;
++};
++
++#endif
+diff --git a/src/c/backend/include/parray.hpp b/src/c/backend/include/parray.hpp
+index 68d3de6..aa6fa56 100644
+--- a/src/c/backend/include/parray.hpp
++++ b/src/c/backend/include/parray.hpp
+@@ -30,8 +30,8 @@ enum AccessMode {
+   INOUT = 2,
+   /// Output of a task.
+   OUT = 3,
+-  /// Removed PArray (false everywhere).
+-  REMOVED = 4,
++  /// Freed PArray (false everywhere).
++  FREED = 4,
+   /// Deleted PArray (removed from table).
+   DELETED = 5,
+ };
+@@ -59,7 +59,7 @@ public:
+   /// compared to restructuring overheads), but this counter is decreased.
+   /// This is used to provide more accurate PArray placement information
+   /// to the task mapping step.
+-  std::vector<CopyableAtomic<size_t>> num_active_tasks;
++  std::vector<CopyableAtomic<size_t>> num_referring_tasks;
+ 
+   InnerPArray() = delete;
+   InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *,
+@@ -92,14 +92,14 @@ public:
+   /// Add a pointer of the task that will use this PArray to the task list
+   void add_task(InnerTask *task);
+ 
+-  /// Increase the counter for the active tasks that use this PArray.
+-  void incr_num_active_tasks(DevID_t global_dev_id);
++  /// Increase the number of the tasks referring this PArray.
++  void incr_num_referring_tasks(DevID_t global_dev_id);
+ 
+-  /// Decrease the counter for the active tasks that use this PArray.
+-  void decr_num_active_tasks(DevID_t global_dev_id);
++  /// Decrease the number of the tasks referring this PArray.
++  void decr_num_referring_tasks(DevID_t global_dev_id);
+ 
+-  /// Get the number of counter for the active tasks that use this PArray.
+-  size_t get_num_active_tasks(DevID_t global_dev_id);
++  /// Get the number of the tasks referring this PArray.
++  size_t get_num_referring_tasks(DevID_t global_dev_id);
+ 
+   // TODO(hc): I will replace this list with a concurrent map.
+   /// Get a reference to a list of tasks who are using this PArray
+@@ -136,8 +136,8 @@ private:
+   //           I will use this map: https://github.com/greg7mdp/parallel-hashmap
+   //           I have used this for a while and it is good.
+   TaskList _task_lists;
+-
++  /// Python PArray address
+   void *_py_parray;
+ };
+ 
+-} // namespace parray
+\ No newline at end of file
++} // namespace parray
+diff --git a/src/c/backend/include/parray_tracker.hpp b/src/c/backend/include/parray_tracker.hpp
+index b49c62f..c46141b 100644
+--- a/src/c/backend/include/parray_tracker.hpp
++++ b/src/c/backend/include/parray_tracker.hpp
+@@ -13,6 +13,10 @@
+ 
+ using namespace parray;
+ 
++/**
++ * @brief PArray tracker that tracks PArray mapping state.
++ * Note that this does not track slice PArrays, but a complete PArray.
++ */
+ class PArrayTracker {
+ public:
+   PArrayTracker(size_t num_devices) : num_devices_(num_devices) {
+@@ -204,7 +208,7 @@ protected:
+ 
+   /// Any worker thread can update states of PArrays.
+   /// Guard operations by this lock.
+-  /// TODO(hc): This will be replaced with parallel hash map.
++  /// TODO(hc): This will be replaced with parallel hash map (phmap).
+   std::mutex mtx;
+ };
+ 
+diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp
+index f38cfac..17f72b3 100644
+--- a/src/c/backend/include/runtime.hpp
++++ b/src/c/backend/include/runtime.hpp
+@@ -44,6 +44,7 @@
+ #include "parray_tracker.hpp"
+ #include "profiling.hpp"
+ #include "resource_requirements.hpp"
++#include "memory_manager.hpp"
+ 
+ using namespace std::chrono_literals;
+ using namespace parray;
+@@ -985,9 +986,26 @@ public:
+   /* Should Run, Stop Condition */
+   std::atomic<bool> should_run = true;
+ 
++  /* Clear all Python PArray references if users request */ 
++  std::atomic<bool> clear_all_pyparrays = false;
++  /* Clear all C PArray objects if users request in eviction manager */ 
++  std::atomic<bool> clear_all_cparrays = false;
++
+   /* Phase: maps tasks to devices */
+   Mapper *mapper;
+ 
++  /* If it is set, break an infinite loop in InnerScheduler::run()
++     and invoke PArray eviction from PythonScheduler */
++  bool break_for_eviction = false;
++  /* Memory size to evict for each device */
++  std::vector<size_t> memory_size_to_evict{0};
++
++  /* Set necessary memory size getting from eviction manager
++     on each device */
++  void set_memory_size_to_evict(size_t, DevID_t);
++  /* Get memory size to evict for each device */
++  size_t get_memory_size_to_evict(DevID_t);
++
+   /* Phase reserves resources to limit/plan task execution*/
+   MemoryReserver *memory_reserver;
+   RuntimeReserver *runtime_reserver;
+@@ -995,7 +1013,7 @@ public:
+   /*Responsible for launching a task. Signals worker thread*/
+   Launcher *launcher;
+ 
+-  InnerScheduler(DeviceManager *device_manager);
++  InnerScheduler(LRUGlobalEvictionManager *memory_manager, DeviceManager *device_manager);
+   ~InnerScheduler();
+   // InnerScheduler(int nworkers);
+ 
+@@ -1017,6 +1035,9 @@ public:
+   /* Set Python "stop" callback */
+   void set_stop_callback(stopfunc_t stop_callback);
+ 
++  /* Get a flag that represents if there is still a task to be executed */
++  bool get_should_run();
++
+   /* Run the scheduler thread. Active for the lifetime of the Parla program */
+   void run();
+ 
+@@ -1087,6 +1108,20 @@ public:
+   /* Release a PArray in a device */
+   void remove_parray(InnerPArray *parray, DevID_t global_dev_id);
+ 
++  /* Reflect a PArray removal to PArray trackers */
++  void remove_parray_from_tracker(
++      parray::InnerPArray *parray, DevID_t global_dev_id);
++
++  void grab_parray_reference(
++      parray::InnerPArray *parray, DevID_t global_dev_id) {
++    this->mm_->grab_parray_reference(parray, global_dev_id);
++  }
++
++  void release_parray_reference(
++      parray::InnerPArray *parray, DevID_t global_dev_id) {
++    this->mm_->release_parray_reference(parray, global_dev_id);
++  }
++
+   /* Get mapped memory on device */
+   size_t get_mapped_memory(DevID_t global_dev_id);
+ 
+@@ -1108,10 +1143,17 @@ public:
+ 
+   DeviceManager *get_device_manager() { return this->device_manager_; }
+ 
++  /* Invoke clearing all C PArray instances from a PArray eviction manager. */
++  void invoke_all_cparrays_clear();
++  
++  /* Get a flag that represents whether Python PArrays should be cleared */
++  bool get_all_pyparrays_clear_flag();
+ protected:
+   /// It manages all device instances in C++.
+   /// This is destructed by the Cython scheduler.
+   DeviceManager *device_manager_;
++
++  LRUGlobalEvictionManager *mm_;
+ };
+ 
+ #endif // PARLA_BACKEND_HPP
+diff --git a/src/c/backend/memory_manager.cpp b/src/c/backend/memory_manager.cpp
+new file mode 100644
+index 0000000..f829294
+--- /dev/null
++++ b/src/c/backend/memory_manager.cpp
+@@ -0,0 +1 @@
++#include "include/memory_manager.hpp"
+diff --git a/src/c/backend/parray.cpp b/src/c/backend/parray.cpp
+index 5e555d4..20cb46a 100644
+--- a/src/c/backend/parray.cpp
++++ b/src/c/backend/parray.cpp
+@@ -9,7 +9,7 @@ InnerPArray::InnerPArray(void *py_parray, uint64_t id, uint64_t parent_id,
+                          DevID_t num_devices)
+     : _py_parray(py_parray), id(id), parent_id(parent_id),
+       _parent_parray(parent_parray), _state(state), _num_devices(num_devices) {
+-  num_active_tasks.resize(num_devices);
++  num_referring_tasks.resize(num_devices);
+ }
+ 
+ const uint64_t InnerPArray::get_size() const { return this->_size; }
+@@ -29,32 +29,32 @@ void InnerPArray::add_task(InnerTask *task) {
+   this->_task_lists.push_back(task);
+ }
+ 
+-void InnerPArray::incr_num_active_tasks(DevID_t global_dev_id) {
++void InnerPArray::incr_num_referring_tasks(DevID_t global_dev_id) {
+   if (this->_parent_parray != nullptr) {
+-    this->_parent_parray->num_active_tasks[global_dev_id].fetch_add(
++    this->_parent_parray->num_referring_tasks[global_dev_id].fetch_add(
+         1, std::memory_order_relaxed);
+   } else {
+-    this->num_active_tasks[global_dev_id].fetch_add(1,
++    this->num_referring_tasks[global_dev_id].fetch_add(1,
+                                                     std::memory_order_relaxed);
+   }
+ }
+ 
+-void InnerPArray::decr_num_active_tasks(DevID_t global_dev_id) {
++void InnerPArray::decr_num_referring_tasks(DevID_t global_dev_id) {
+   if (this->_parent_parray != nullptr) {
+-    this->_parent_parray->num_active_tasks[global_dev_id].fetch_sub(
++    this->_parent_parray->num_referring_tasks[global_dev_id].fetch_sub(
+         1, std::memory_order_relaxed);
+   } else {
+-    this->num_active_tasks[global_dev_id].fetch_sub(1,
++    this->num_referring_tasks[global_dev_id].fetch_sub(1,
+                                                     std::memory_order_relaxed);
+   }
+ }
+ 
+-size_t InnerPArray::get_num_active_tasks(DevID_t global_dev_id) {
++size_t InnerPArray::get_num_referring_tasks(DevID_t global_dev_id) {
+   if (this->_parent_parray != nullptr) {
+-    return this->_parent_parray->num_active_tasks[global_dev_id].load(
++    return this->_parent_parray->num_referring_tasks[global_dev_id].load(
+         std::memory_order_relaxed);
+   } else {
+-    return this->num_active_tasks[global_dev_id].load(
++    return this->num_referring_tasks[global_dev_id].load(
+         std::memory_order_relaxed);
+   }
+ }
+diff --git a/src/c/backend/parray_tracker.cpp b/src/c/backend/parray_tracker.cpp
+index ac5e88d..e5226d7 100644
+--- a/src/c/backend/parray_tracker.cpp
++++ b/src/c/backend/parray_tracker.cpp
+@@ -12,7 +12,6 @@ size_t PArrayTracker::do_parray_creation_(AccessMode access_mode,
+   if (access_mode >= AccessMode::OUT || is_tracked) {
+     return 0;
+   } else {
+-
+     size_t to_move = parray->get_size();
+ 
+     // std::cout << "PArrayTracker::do_parray_creation " << std::endl;
+@@ -65,8 +64,8 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id,
+       this->set_parray_unsafe(i, parray_id, false);
+     }
+ 
+-    if (access_mode != AccessMode::REMOVED) {
+-      this->set_parray_unsafe(dev_id, parray_id, false);
++    if (access_mode != AccessMode::FREED) {
++      this->set_parray_unsafe(dev_id, parray_id, true);
+     }
+ 
+     if (is_slice) {
+@@ -92,7 +91,6 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id,
+         }
+       }
+       */
+-
+     } else {
+       // If the PArray is not a slice, it is a parent
+       // invalidate all its children on all devices except the target
+@@ -104,7 +102,7 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id,
+           this->set_parray_unsafe(i, child_id, false);
+         }
+ 
+-        if (access_mode != AccessMode::REMOVED) {
++        if (access_mode != AccessMode::FREED) {
+           this->set_parray_unsafe(dev_id, child_id, false);
+         }
+       }
+@@ -217,4 +215,4 @@ size_t PArrayTracker::check_log(const DevID_t dev_id,
+   // std::cout << "Checked parray removal: " << to_move << std::endl;
+ 
+   return to_move;
+-}
+\ No newline at end of file
++}
+diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp
+index 79eef62..6975c82 100644
+--- a/src/c/backend/phases.cpp
++++ b/src/c/backend/phases.cpp
+@@ -93,7 +93,7 @@ void Mapper::map_task(InnerTask *task, DeviceRequirementList &chosen_devices) {
+       mapped_pool.increase<Resource::Memory>(mapped_size);
+ 
+       InnerPArray *parray = parray_access.first;
+-      parray->incr_num_active_tasks(global_dev_id);
++      parray->incr_num_referring_tasks(global_dev_id);
+     }
+   }
+ 
+@@ -132,6 +132,14 @@ void Mapper::run(SchedulerPhase *next_phase) {
+   this->drain_parray_buffer();
+ 
+   // TODO Fix Issue #108
++  /** XXX(hc): revisit this
++  // Accumulate necessary memory size for PArrays being mapped.
++  std::vector<size_t> accum_necessary_memory;
++  // Eviction manager does not evict CPU instances; In the future,
++  // this also might be evicted to other devices. But to simplify indexing,
++  // allocate as the number of total devices.
++  accum_necessary_memory.resize(this->device_manager->get_num_devices(DeviceType::All));
++  ***/
+   while (has_task && num_task_mapping_attempt < 20) {
+ 
+     this->drain_parray_buffer();
+@@ -277,6 +285,9 @@ bool MemoryReserver::check_data_resources(InnerTask *task) {
+       InnerPArray *parray = parray_access.first;
+       AccessMode access_mode = parray_access.second;
+ 
++      // Register this PArray to eviction manager's table
++      this->scheduler->grab_parray_reference(parray, local_device_idx);
++
+       // If the PArray is not an input, then we don't need to check size
+       // Note(@dialecticDolt):
+       // There is literally no such thing as an out type in our syntax why do we
+@@ -305,6 +316,10 @@ bool MemoryReserver::check_data_resources(InnerTask *task) {
+ 
+     status = status && device_status;
+     if (!status) {
++      // If a device has not enough memory, activate eviction manager
++      this->scheduler->set_memory_size_to_evict(
++          size_on_device * 2, local_device_idx);
++      this->scheduler->break_for_eviction = true;
+       break;
+     }
+   }
+diff --git a/src/c/backend/policy.cpp b/src/c/backend/policy.cpp
+index 1cf1c76..89a85fb 100644
+--- a/src/c/backend/policy.cpp
++++ b/src/c/backend/policy.cpp
+@@ -12,6 +12,8 @@ bool LocalityLoadBalancingMappingPolicy::calc_score_devplacement(
+   // std::cout << "[Locality-aware- and Load-balancing mapping policy]\n";
+ 
+   // Check device resource availability.
++  // Note that this counter is tracking 'launched' or 'reserved' resource
++  // status.
+   if (!device.check_resource_availability(dev_placement_req.get())) {
+     // std::cout << "Device resource failure!" << std::endl;
+     return false;
+diff --git a/src/c/backend/scheduler.cpp b/src/c/backend/scheduler.cpp
+index a6fadfc..95152aa 100644
+--- a/src/c/backend/scheduler.cpp
++++ b/src/c/backend/scheduler.cpp
+@@ -126,8 +126,18 @@ template class WorkerPool<WorkerQueue, WorkerQueue>;
+ 
+ // Scheduler Implementation
+ 
+-InnerScheduler::InnerScheduler(DeviceManager *device_manager)
+-    : device_manager_(device_manager) {
++InnerScheduler::InnerScheduler(LRUGlobalEvictionManager* memory_manager,
++                               DeviceManager *device_manager)
++    : device_manager_(device_manager), mm_(memory_manager) {
++
++  // For now, it does not evict PArrays on CPU memory.
++  this->memory_size_to_evict.resize(
++      device_manager->template get_num_devices<DeviceType::All>());
++
++  // A dummy task count is used to keep the scheduler alive.
++  // NOTE: At least one task must be added to the scheduler by the main thread,
++  // otherwise the runtime will finish immediately
++  // this->increase_num_active_tasks();
+ 
+   this->workers.set_num_workers(1);
+ 
+@@ -157,31 +167,70 @@ void InnerScheduler::set_stop_callback(stopfunc_t stop_callback) {
+   this->stop_callback = stop_callback;
+ }
+ 
++bool InnerScheduler::get_should_run() {
++  return this->should_run.load();
++}
++
++bool InnerScheduler::get_all_pyparrays_clear_flag() {
++  return this->clear_all_pyparrays.load();
++}
++
++void InnerScheduler::set_memory_size_to_evict(
++    size_t size, DevID_t dev_id) {
++  this->memory_size_to_evict[dev_id] = size;
++}
++
++size_t InnerScheduler::get_memory_size_to_evict(DevID_t dev_id) {
++  return this->memory_size_to_evict[dev_id];
++}
++
+ void InnerScheduler::run() {
+   NVTX_RANGE("Scheduler::run", NVTX_COLOR_RED)
+-  unsigned long long iteration_count = 0;
++  this->clear_all_cparrays = false;
++  this->clear_all_pyparrays = false;
+   while (this->should_run.load()) {
++    this->break_for_eviction = false;
+     auto status = this->activate();
+     if (this->sleep_flag) {
+       std::this_thread::sleep_for(std::chrono::milliseconds(this->sleep_time));
+     }
++    if (this->break_for_eviction) {
++      // Yield a control to a Python scheduler to evict PArrays since
++      // PArray coherency protocol is managed at there.
++      break;
++    }
++    if (this->clear_all_cparrays.load()) {
++      // TODO(hc): This should be more generalized and refined.
++      // Temporarily use it as experimental puprose.
++      std::cout << "Clear all C/Python parrays..\n";
++      this->mm_->clear_all_instances();
++      this->clear_all_pyparrays = true;
++      break;
++    }
+   }
+ }
+ 
+ void InnerScheduler::stop() {
+   LOG_INFO(SCHEDULER, "Stopping scheduler");
+   this->should_run = false;
+-  launch_stop_callback(this->stop_callback, this->py_scheduler);
++  // XXX(hc): To process PArray eviction on Python,
++  // Python scheduler now has an while loop that iterates until there is
++  // no more task, and it wraps C scheduler's loop.
++  // Therefore, there is no point for C++ scheduler to explicitly invoke
++  // this callback at here. Python scheduler knows when it needs to stop.
++  //launch_stop_callback(this->stop_callback, this->py_scheduler);
+   LOG_INFO(SCHEDULER, "Stopped scheduler");
+ }
+ 
++void InnerScheduler::invoke_all_cparrays_clear() {
++  this->clear_all_cparrays = true; 
++}
++
+ Scheduler::Status InnerScheduler::activate() {
+   // std::cout<< "Scheduler Activated" << std::endl;
+-
+   this->mapper->run(this->memory_reserver);
+   this->memory_reserver->run(this->runtime_reserver);
+   this->runtime_reserver->run(this->launcher);
+-
+   // LOG_TRACE(SCHEDULER, "ReadyPhase Status: {}", this->runtime_reserver);
+   return this->status;
+ }
+@@ -289,6 +338,15 @@ void InnerScheduler::remove_parray(InnerPArray *parray, DevID_t global_dev_id) {
+   // Could also be to call DELETED or REMOVED status on do_log
+ }
+ 
++void InnerScheduler::remove_parray_from_tracker(
++    parray::InnerPArray *parray, DevID_t global_dev_id) {
++  AccessMode access_mode = AccessMode::FREED;
++  this->mapper->get_parray_tracker()->do_log(global_dev_id,
++      std::make_pair(parray, access_mode));
++  this->memory_reserver->get_parray_tracker()->do_log(global_dev_id,
++      std::make_pair(parray, access_mode));
++}
++
+ size_t InnerScheduler::get_mapped_memory(DevID_t global_dev_idx) {
+   Device *device =
+       this->device_manager_->get_device_by_global_id(global_dev_idx);
+@@ -362,7 +420,12 @@ void InnerScheduler::task_cleanup_postsync(InnerWorker *worker, InnerTask *task,
+       for (size_t j = 0; j < parray_access_list.size(); ++j) {
+         auto &parray_access = parray_access_list[j];
+         InnerPArray *parray = parray_access.first;
+-        parray->decr_num_active_tasks(dev_id);
++        parray->decr_num_referring_tasks(dev_id);
++        // Decrease this PArray's reference count.
++        // If this becomes 0, this instance will be release
++        // when the PArray coherency protocol updates it
++        // to eviction state.
++        this->release_parray_reference(parray, dev_id);
+       }
+     }
+ 
+diff --git a/src/python/parla/__init__.py b/src/python/parla/__init__.py
+index ba53b56..49ca47a 100644
+--- a/src/python/parla/__init__.py
++++ b/src/python/parla/__init__.py
+@@ -7,6 +7,7 @@ from .cython import core
+ from .cython import device_manager
+ from .cython import device
+ from .cython import variants
++from .cython import mm
+ from .common.spawn import spawn
+ from .common import parray
+ 
+@@ -22,6 +23,7 @@ TaskSpace = tasks.TaskSpace
+ Tasks = tasks.TaskCollection
+ 
+ DeviceManager = device_manager.PyDeviceManager
++PyMM = mm.PyMM
+ 
+ Stream = device.Stream
+ create_env = tasks.create_env
+@@ -59,6 +61,7 @@ class Parla:
+         self.sig = sig_type
+         self.handle_interrupt = True
+         self._device_manager = DeviceManager(dev_config_file)
++        self._memory_manager = PyMM(self._device_manager)
+ 
+         if logfile is None:
+             logfile = os.environ.get("PARLA_LOGFILE", None)
+@@ -77,7 +80,9 @@ class Parla:
+         if hasattr(self, "_sched"):
+             raise ValueError(
+                 "Do not use the same Parla object more than once.")
+-        self._sched = self.scheduler_class(self._device_manager, **self.kwds)
++        self._sched = self.scheduler_class(self._memory_manager,
++                                           self._device_manager,
++                                           **self.kwds)
+ 
+         self.interuppted = False
+         self.released = False
+diff --git a/src/python/parla/common/parray/@ b/src/python/parla/common/parray/@
+new file mode 100644
+index 0000000..af841d1
+--- /dev/null
++++ b/src/python/parla/common/parray/@
+@@ -0,0 +1,541 @@
++from __future__ import annotations
++from typing import TYPE_CHECKING, Union, List, Dict, Tuple, Any
++
++import numpy
++import ctypes
++
++#TODO: Fix this to be more stable and less of a hack.
++try:
++    import cupy
++except (ImportError, AttributeError):
++    import numpy as cupy
++
++from .coherence import CPU_INDEX
++
++if TYPE_CHECKING:  # False at runtime
++    import cupy
++    ndarray = Union[numpy.ndarray, cupy.ndarray]
++    SlicesType = Union[slice, int, tuple]
++    IndicesMapType = List[Union[Dict[int, int], tuple]]
++    from parla.cython.cyparray_state import CyPArrayState
++
++class MultiDeviceBuffer:
++    """Underlying Buffer of PArray.
++
++    It holds per device array copy and also index mapping.
++    """
++
++    _buffer: Dict[int, ndarray | List[ndarray] | None]
++    shape: tuple
++    _indices_map: Dict[int, List[IndicesMapType] | None]
++    _cyparray_state: CyPArrayState
++
++    def __init__(self, num_gpu: int, cyparray_state: CyPArrayState):
++        # per device buffer
++        # key: device_id
++        # val: single (complete) ndarray or list of (sub) ndarray
++        self._buffer = {n: None for n in range(num_gpu)}  # add gpu id
++        self._buffer[CPU_INDEX] = None  # add cpu id
++
++        # per device indices mapping
++        # key: device_id
++        # val: list of {global_index: local_index} and tuple(begin, end, stop), and the tuple is a represent of slice(begin, end, stop)
++        self._indices_map = {n: None for n in range(num_gpu)}
++        self._indices_map[CPU_INDEX] = None
++
++        # the shape of the complete array
++        self.shape = ()
++
++        self._cyparray_state = cyparray_state
++
++    def nbytes_at(self, device_id:int) -> int:
++        """
++        Return the buffer size at `device_id`
++        """
++        buffer = self._buffer[device_id]
++        if buffer is None:
++            return 0
++        elif isinstance(buffer, list): # subarray at this device buffer
++            # size is the sum
++            nbytes = 0
++            for subarray in buffer:
++                nbytes += subarray.nbytes
++            return nbytes
++        else:  # complete array
++            return buffer.nbytes
++
++    def set_complete_array(self, array: ndarray) -> int:
++        """
++        Add array into the buffer (based on array's device).
++
++        Args:
++            array: :class:`cupy.ndarray` or :class:`numpy.array` object
++
++        Return:
++            a location (device_id) of the array
++        """
++        # get the array's location
++        if isinstance(array, numpy.ndarray):
++            location = CPU_INDEX
++        else:
++            location = int(array.device)
++
++        self._buffer[location] = array
++        self.shape = array.shape
++        self._cyparray_state.set_exist_on_device(location, True)
++        return location
++
++    def set(self, device_id: int, array: ndarray, is_complete: bool = True, overwrite: bool = False) -> None:
++        """
++        Set copy at a device, also clean up existing `indices_map` if necessary
++
++        Args:
++            device_id: gpu device_id or CPU_INDEX
++            array: :class:`cupy.ndarray` or :class:`numpy.array` object
++            is_complete: True if `array` is a complete copy, otherwise `array` is a subarray
++            overwrite: True if need to clean other subarray copy inside the device before assign the new array
++        """
++        if is_complete:
++            self._indices_map[device_id] = None
++            self._buffer[device_id] = array
++        else:
++            if not isinstance(self._buffer[device_id], List) or overwrite:
++                self._indices_map[device_id] = None
++                self._buffer[device_id] = [array]
++            else:
++                self._buffer[device_id].append(array)
++        self._cyparray_state.set_exist_on_device(device_id, True)
++
++    def get(self, device_id: int) -> ndarray | List[ndarray] | None:
++        """
++        Return the copy at a device
++
++        Args:
++            device_id: gpu device_id or CPU_INDEX
++
++        Return
++            :class:`cupy.ndarray` or :class:`numpy.array` object
++        """
++        return self._buffer[device_id]
++
++    def get_global_slices(self, device_id:int, subarray_index:int) -> SlicesType | None:
++        """
++        Return global slices of one copy at the device.
++
++        If the copy is complete, return None
++        """
++        if self._indices_map[device_id] is None:
++            return None
++        else:
++            slices = []
++            for device_indices in self._indices_map[device_id][subarray_index]:
++                if isinstance(device_indices, dict):
++                    index = list(device_indices.keys())
++                    if len(index) == 1:
++                        slices.append(index[0])
++                    else:
++                        slices.append(index)
++                else:
++                    slices.append(slice(*device_indices))
++
++            return tuple(slices)
++
++    @staticmethod
++    def _map_int_with_int_map(n: int, int_map: Dict[int, int]) -> int | None:
++        """
++        Find the mapping of `n` in `int_map`
++
++        if `n` not in `int_map`, return None
++
++        example:
++            n: 2
++            int_map: {1:0, 2:1}
++            return: 1
++        """
++        return None if n not in int_map else int_map[n]
++
++    @staticmethod
++    def _map_int_with_slice(n: int, target_slice: tuple) -> int | None:
++        """
++        Find the mapping of `n` in a `target_slice` (find index of `n` in `target_slice`)
++        `target_slice` is a tuple(begin, end, step)
++
++        if `n` not in `target_slice`, return None
++
++        example:
++            n: 2
++            target_slice: (2, 4, 1)
++            return: 0
++        """
++        # TODO: assume slice is simple (no neg value)
++        begin, end, step = target_slice
++        step = 1 if step is None else step
++
++        # bound checking
++        if n < begin or n >= end:
++            return None
++        if (n - begin) % step != 0:
++            return None
++
++        return (n - begin) // step
++
++    @staticmethod
++    def _map_slice_with_slice(input_slice: tuple, target_slice: tuple) -> tuple | None:
++        """
++        Find the mapping of `input_slice` in a `target_slice`
++        `input_slice` and `target_slice` is a tuple(begin, end, step)
++
++        if `input_slice` not a subset of `target_slice`, return None
++
++        example:
++            input_slice: (2, 10, 4)
++            target_slice: (0, 10, 2)
++            return: (1, 5, 2)
++        """
++        # TODO: assume slice is simple (no neg value)
++        target_begin, target_end, target_step = target_slice
++        target_step = 1 if target_step is None else target_step
++
++        input_begin, input_end, input_step = input_slice
++        input_step = 1 if input_step is None else input_step
++
++        mapped_begin = MultiDeviceBuffer._map_int_with_slice(
++            input_begin, target_slice)
++
++        # get the last possible element in range of `input_slice`
++        # TODO: what if last_element < input_begin ?
++        last_element = input_end - input_step + (input_end - input_begin) % input_step
++        mapped_end = MultiDeviceBuffer._map_int_with_slice(last_element, target_slice)
++
++        if mapped_begin is None or mapped_end is None:
++            return None
++
++        # adjust step
++        if input_step % target_step != 0:
++            return None
++        mapped_step = input_step // target_step
++
++        return mapped_begin, mapped_end + 1, mapped_step  # tuple
++
++    def map_local_slices(self, device_id: int, global_slices: SlicesType) -> (int, SlicesType):
++        """
++        Map a given global slices to local slices wrt buffer at the device.
++
++        Raise error if `global_slices` out of range
++
++        Return subarray_index: the index of subarray in the list of `_buffer[device_id]`
++               local_slices: the local slices which maps to the `global_slices`
++        Note: this method assume a indices mapping exists for this device
++        """
++        # indexing into the whole array, index of out bound
++        not_tuple = False
++        if not isinstance(global_slices, tuple):  # if not a tuple, make it a tuple
++            global_slices = tuple([global_slices])
++            not_tuple = True
++
++        local_slices = []
++
++        if len(self.shape) < len(global_slices):
++            raise IndexError(f"index out of range, index:{global_slices}")
++
++        final_subarray_index = 0
++
++        for subarray_index in range(len(self._indices_map[device_id])):  # for each subarray at this device
++            indices_map = self._indices_map[device_id][subarray_index]
++
++            for d in range(len(global_slices)):
++                size = self.shape[d]  # number of entries at this axis
++                global_index = global_slices[d]
++                index_map = None if d >= len(indices_map) else indices_map[d]
++
++                if index_map is None:  # None means 1:1 map to all elements at this axis
++                    local_index = global_index
++                elif isinstance(index_map, dict) and len(index_map) == 1:
++                    # special case, this axis was indexed by a int, so
++                    # dimension was reduced by 1,
++                    # need to ignore this axis, just check index match or not
++                    if list(index_map.keys())[0] == global_index:  # false if type or value doesn't match
++                        continue
++                    else:
++                        local_index = None
++                elif isinstance(index_map, tuple):
++                    if isinstance(global_index, int):  # int vs slice
++                        local_index = MultiDeviceBuffer._map_int_with_slice(global_index, index_map)
++                    elif isinstance(global_index, list):  # List[int] vs slice
++                        local_index = [MultiDeviceBuffer._map_int_with_slice(i, index_map) for i in global_index]
++
++                        # any index out of bound?
++                        if None in local_index:
++                            local_index = None
++                    elif isinstance(global_index, slice):  # slice vs slice
++                        # slice to tuple
++                        slice_tuple = global_index.indices(size)
++                        local_tuple = MultiDeviceBuffer._map_slice_with_slice(slice_tuple, index_map)
++                        if local_tuple is None:
++                            local_index = None
++                        else:
++                            local_index = slice(*local_tuple)
++                    else:
++                        raise IndexError(f"Unsupported slices type: {type(global_index)}")
++                else:  # Map is int or list<int>
++                    if isinstance(global_index, int):  # int vs int/list
++                        local_index = self._map_int_with_int_map(global_index, index_map)
++                    elif isinstance(global_index, list):  # list vs int/list
++                        local_index = [self._map_int_with_int_map(i, index_map) for i in global_index]
++
++                        if None in local_index:
++                            local_index = None
++                    elif isinstance(global_index, slice):  # slice vs int/list
++                        # slice to tuple
++                        slice_tuple = global_index.indices(size)
++                        local_index = [self._map_int_with_int_map(i, index_map) for i in range(*slice_tuple)]
++
++                        if None in local_index:
++                            local_index = None
++                    else:
++                        raise IndexError(f"Unsupported slices type {type(global_index)}")
++
++                # if None, it means index out of range at this axis
++                if local_index is None:
++                    # check next copy
++                    local_slices = None
++                    break
++
++                local_slices.append(local_index)
++
++            if local_slices is None:  # result is not found for this subarray
++                if subarray_index == len(self._indices_map[device_id]) - 1:  # this is the last subarray
++                    local_slices = None  # non slices is found
++                else: # check next subarray
++                    local_slices = []  # clear intermidate result
++            else:
++                final_subarray_index = subarray_index
++                break
++
++        if local_slices is None:
++            raise IndexError(f"index out of range, index:{global_slices}")
++        elif not_tuple:
++            if len(local_slices) == 0:  # only be possible when special case int vs int exists and all axis are ignored
++                return final_subarray_index, slice(None, None, None)
++            else:
++                return final_subarray_index, local_slices[0]
++        else:
++            return final_subarray_index, tuple(local_slices)
++
++    def set_slices_mapping(self, device_id: int, global_slices: SlicesType):
++        """
++        set a global slices to local slices mapping wrt buffer at the device.
++
++        Raise error if `global_slices` is higher dim than shape
++        Note: this call doesn't check slice is within range, if it is not in range
++              exception will be trigger later when trying to index into the copy
++        """
++        if not isinstance(global_slices, tuple):  # if not a tuple, make it a tuple
++            global_slices = tuple([global_slices])
++
++        if len(self.shape) < len(global_slices):
++            raise IndexError(f"index out of range, index:{global_slices}")
++
++        slices_map_list = []
++        for d in range(len(global_slices)):
++            size = self.shape[d]  # number of entries at this axis
++            global_slice = global_slices[d]
++
++            if isinstance(global_slice, int):  # a single integer
++                slice_map = {global_slice: 0}
++            elif isinstance(global_slice, list):  # a list of integer
++                slice_map = {global_slice[i]: i for i in range(len(global_slice))}
++            elif isinstance(global_slice, slice):  # slice
++                # save slice as a tuple
++                # None in slice will be instantiated by concrete values
++                slice_map = global_slice.indices(size)
++            else:
++                raise IndexError(f"Unsupported slices type {type(global_slice)}")
++            slices_map_list.append(slice_map)
++
++        if self._indices_map[device_id] is None:
++            self._indices_map[device_id] = [slices_map_list]
++        else:
++            self._indices_map[device_id].append(slices_map_list)
++
++    def get_by_global_slices(self, device_id: int, global_slices: SlicesType):
++        """
++        Indexing/Slicing the buffer by `global_slices`.
++
++        `global_slices` will be first converted into local slices
++
++        Args:
++            device_id: gpu device_id or CPU_INDEX
++            global_slices: slice/ints/tuple/list<int>, use the same format as advance indexing of numpy
++
++        Return
++            :class:`cupy.ndarray` or :class:`numpy.array` object or `None` if there is no copy at that device
++        """
++        # check if a copy exists at this device
++        #
++        # This is needed for the usage of @spawn() annotation
++        # sometimes device has no copy but still need a parray slices view object in advance
++        # Example:
++        # A = parray(numpy_array)
++        # @spawn(inout=[A], placement=gpu(0))
++        # def task1():
++        #   ... a task move A to GPU. free CPU copy ...
++        # @spawn(inout=[A[0]], placement=gpu(1))
++        # def task2():
++        #   error: cpu's copy is freed -> `inout=[A[0]]` trigger an exception,
++        #   since it try a slice A at CPU (spawn() itself is happened at outer CPU task)
++        if self._buffer[device_id] is None:
++            return None
++
++        # check if there is a mapping
++        if self._indices_map[device_id] is None:
++            return self._buffer[device_id].__getitem__(global_slices)
++        else:
++            # map global slices to local slices
++            subarray_index, local_slices = self.map_local_slices(device_id, global_slices)
++            return self._buffer[device_id][subarray_index].__getitem__(local_slices)
++
++    def set_by_global_slices(self, device_id: int, global_slices: SlicesType, value: ndarray | Any):
++        """
++        Indexing/Slicing the buffer by `global_slices` and set value.
++
++        `global_slices` will be first converted into local slices
++
++        Args:
++            device_id: gpu device_id or CPU_INDEX
++            global_slices: slice/ints/tuple/list<int>, use the same format as advance indexing of numpy
++            value: the data to set
++
++        Return
++            :class:`cupy.ndarray` or :class:`numpy.array` object
++        """
++        # check if there is a mapping
++        if self._indices_map[device_id] is None:
++            self._buffer[device_id].__setitem__(global_slices, value)
++        else:
++            # map global slices to local slices
++            subarray_index, local_slices = self.map_local_slices(device_id, global_slices)
++            self._buffer[device_id][subarray_index].__setitem__(local_slices, value)
++
++
++    def _move_data(self, copy_func, dst: int, src: int, subarray_index: int, dst_slices: SlicesType, src_slices: SlicesType, dst_is_current_device:bool = True):
++        """
++        Helper function for copy_data_between_device
++        """
++        if dst_is_current_device:
++            if dst_slices is None and src_slices is None:  # Complete to Complete
++                self._buffer[dst] = copy_func(self._buffer[src])
++            elif dst_slices is None and src_slices is not None:  # Incomplete to Complete
++                self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index])
++            elif dst_slices is not None and src_slices is None:  # Complete to incomplete
++                if self._buffer[dst] is None:
++                    self._buffer[dst] = []
++                self._buffer[dst].append(copy_func(self._buffer[src][dst_slices]))
++            else:  # incomplete to incomplete
++                raise ValueError("Copy from subarray to subarray is unsupported")
++        else:
++            with cupy.cuda.Device(dst):  # switch device
++                if dst_slices is None and src_slices is None:  # Complete to Complete
++                    self._buffer[dst] = copy_func(self._buffer[src])
++                elif dst_slices is None and src_slices is not None:  # Incomplete to Complete
++                    self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index])
++                elif dst_slices is not None and src_slices is None:  # Complete to incomplete
++                    if self._buffer[dst] is None:
++                        self._buffer[dst] = []
++                    self._buffer[dst].append(copy_func(self._buffer[src][dst_slices]))
++                else:  # incomplete to incomplete
++                    raise ValueError("Copy from subarray to subarray is unsupported")
++
++    def copy_data_between_device(self, dst: int, src: int, dst_is_current_device: bool = True) -> None:
++        """
++        Copy data from src to dst.
++
++        dst is current device if `dst_is_current_device` is True
++        """
++        # a function to copy data between GPU devices async
++        def copy_from_device_async(src):
++            dst_data = cupy.empty_like(src)
++            dst_data.data.copy_from_device_async(src.data, src.nbytes)
++            return dst_data
++
++        if self._indices_map[src] is None:
++            src_slices_list = [None]
++        else:
++            src_slices_list = [self.get_global_slices(src, i) for i in range(len(self._indices_map[src]))]
++
++        # TRICK: if there are multiple subarray in this device, always pick the last one
++        # this is because load of data always comes together with create indices mapping
++        # so the indices mapping will put at the end of self._indices_map
++        dst_slices = self.get_global_slices(dst, -1)
++
++        for subarray_index in range(len(src_slices_list)):
++            src_slices = src_slices_list[subarray_index]
++            if src == CPU_INDEX:  # copy from CPU to GPU
++                self._move_data(cupy.asarray, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device)
++            elif dst != CPU_INDEX:  # copy from GPU to GPU
++                self._move_data(copy_from_device_async, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device)
++            else:  # copy from GPU to CPU
++                self._move_data(cupy.asnumpy, dst, src, subarray_index, dst_slices, src_slices)  # dst_is_current_device is no need if dst is CPU
++        self._cyparray_state.set_exist_on_device(dst, True)
++
++    def get_slices_hash(self, global_slices: SlicesType) -> int:
++        """
++        Get hash value of a slices of complete array.
++
++        This could be done by replaing list and slice to tuple
++        """
++        # little chance to have collision, but what if it happened?
++        hash_value = 17 # use a none zero hash value, so hash(0) != 0
++        prime = 31
++        if not isinstance(global_slices, tuple):
++            if isinstance(global_slices, list):
++                # Built-int hash() method might return negtive value.
++                # c_size_t is to ensure it is not negative
++                hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(global_slices))).value
++            elif isinstance(global_slices, slice):
++                hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices.indices(self.shape[0]))).value
++            else:
++                hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices)).value
++        else:
++            if len(self.shape) < len(global_slices):
++                raise IndexError(f"index out of range, index:{global_slices}")
++
++            for d in range(len(global_slices)):
++                index = global_slices[d]
++                if isinstance(index, list):
++                    hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(index))).value
++                elif isinstance(index, slice):
++                    hash_value = hash_value * prime + ctypes.c_size_t(hash(index.indices(self.shape[d]))).value
++                else:
++                    hash_value = hash_value * prime + ctypes.c_size_t(hash(index)).value
++
++        return hash_value
++
++    def __str__(self):
++        return str(self._buffer)
++
++    def __contains__(self, device_id):
++        """
++        Return True if there is a copy in this device
++        """
++        return device_id in self._buffer and self._buffer[device_id] is not None
++
++    def clear(self, device_id) -> None:
++        """
++        Clear data in device_id
++        """
++        import gc
++        mempool = cupy.get_default_memory_pool()
++        pinned_mempool = cupy.get_default_pinned_memory_pool()
++        print("inside evict():", mempool.used_bytes(), flush=True)
++        pritn("total:", mempool.total_bytes())
++        print("device id:", device_id, flush=True)
++#print("inside pinned evict():", pinned_mempool.used_bytes(), flush=True)
++        print("evicting array size:", self._buffer[device_id].nbytes)
++        print("type:", type(self._buffer[device_id]))
++        del self._buffer[device_id]
++        print("After inside evict():", mempool.used_bytes(), flush=True)
++#print("After inside pinned evict():", pinned_mempool.used_bytes(), flush=True)
++        self._indices_map[device_id] = None
++        self._buffer[device_id] = None
++        self._cyparray_state.set_exist_on_device(device_id, False)
+diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py
+index e25396c..8d89075 100644
+--- a/src/python/parla/common/parray/core.py
++++ b/src/python/parla/common/parray/core.py
+@@ -120,6 +120,8 @@ class PArray:
+         # Note(@dialecticDolt):It should be valid to create PArrays outside of a scheduler context!!
+         # FIXME
+ 
++        # Register this PArray to tracker and make a link between
++        # C PArray instance.
+         scheduler = get_scheduler()
+         if scheduler is None:
+             raise NotImplementedError(
+@@ -129,6 +131,7 @@ class PArray:
+         num_devices = len(scheduler.device_manager.get_all_devices())
+         self._cy_parray = CyPArray(
+             self, self.ID, self.parent_ID, self.parent, self._cyparray_state, num_devices)
++        # record the size in Cython PArray
+         self._cy_parray.set_size(self.subarray_nbytes)
+ 
+         target_dev_id = - \
+@@ -301,7 +304,7 @@ class PArray:
+               f"Parent_ID: {self.parent_ID if self.ID != self.parent_ID else None}, "
+               f"Slice: {self._slices[0] if self.ID != self.parent_ID else None}, "
+               f"Bytes: {self.subarray_nbytes}, "
+-              f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}")
++              f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}", flush=True)
+         for device_id, state in self._coherence._local_states.items():
+             if device_id == CPU_INDEX:
+                 device_name = "CPU"
+@@ -311,13 +314,13 @@ class PArray:
+ 
+             if isinstance(state, dict):
+                 print(
+-                    f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy:  # states of slices is unordered wrt the below slices")
++                    f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy:  # states of slices is unordered wrt the below slices", flush=True)
+                 for slice, slice_id in zip(self._array._indices_map[device_id], range(len(self._array._indices_map[device_id]))):
+                     print(
+-                        f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}")
++                        f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}", flush=True)
+             else:
+-                print(f"state: {state_str_map[state]}")
+-        print("---End of Overview")
++                print(f"state: {state_str_map[state]}", flush=True)
++        print("---End of Overview", flush=True)
+ 
+     # slicing/indexing
+ 
+@@ -395,7 +398,7 @@ class PArray:
+ 
+         with self._coherence_cv[device_id]:
+             operations = self._coherence.evict(device_id, keep_one_copy)
+-            if operations[0].inst == MemoryOperation.ERROR:
++            if len(operations) != 0 and operations[0].inst == MemoryOperation.ERROR:
+                 return False  # cannot perform the eviction
+             self._process_operations(operations)
+ 
+@@ -479,11 +482,21 @@ class PArray:
+                 #     f"Evicting {self.name} from {op.src}, size: {to_free} bytes", flush=True)
+ 
+                 scheduler = get_scheduler()
+-                if (to_free > 0) and (scheduler is not None):
+-                    # This frees the memory on the device in the mapped and reserved pools
+-                    scheduler.device_manager.free_memory(op.src, to_free)
+-                    # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well.
+-
++                if scheduler is not None:
++                    if to_free > 0:
++                        # This frees the memory on the device in the mapped and reserved pools
++                        scheduler.device_manager.free_memory(op.src, to_free)
++                        # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well.
++                    src_global_dev_id =
++                        scheduler.device_manager.parrayid_to_globalid(op.src)
++                    if self._cy_parray.get_num_referring_tasks(src_global_dev_id) == 0:
++                        # If none of active tasks refers this PArray,
++                        # remove this PArray on the src device from
++                        # the PArray tracker's table.
++                        scheduler.remove_parray_from_tracker(
++                            self._cy_parray, src_global_dev_id)
++                # decrement the reference counter, relying on GC to free the memor
++                self._array.clear(op.src)
+             elif op.inst == MemoryOperation.ERROR:
+                 raise RuntimeError(
+                     "PArray gets an error from coherence protocol")
+@@ -906,5 +919,11 @@ class PArray:
+     def get_parray_parentid_from_cpp(self):
+         return self._cy_parray.get_parray_parentid()
+ 
+-    def get_num_active_tasks(self, global_dev_id):
+-        return self._cy_parray.get_num_active_tasks(global_dev_id)
++    def get_num_referring_tasks(self, global_dev_id):
++        return self._cy_parray.get_num_referring_tasks(global_dev_id)
++
++    def __del__(self):
++        # Users can explicitly call `del` over a Python PArray.
++        # In this case, detroy its array instance.
++        # TODO(hc): This code is not tested yet
++        self._array = None
+diff --git a/src/python/parla/common/parray/memory.py b/src/python/parla/common/parray/memory.py
+index 05cef02..89bf74c 100644
+--- a/src/python/parla/common/parray/memory.py
++++ b/src/python/parla/common/parray/memory.py
+@@ -573,3 +573,9 @@ class MultiDeviceBuffer:
+         self._buffer[device_id] = None
+         self._cyparray_state.set_exist_on_device(device_id, False)
+         return to_free
++
++    def __del__(self):
++        for i in range(0, len(self._buffer)):
++            self._cyparray_state.set_exist_on_device(i, False)
++        self._indices_map = None
++        self._buffer = None
+diff --git a/src/python/parla/cython/CMakeLists.txt b/src/python/parla/cython/CMakeLists.txt
+index 431fc58..7378df9 100644
+--- a/src/python/parla/cython/CMakeLists.txt
++++ b/src/python/parla/cython/CMakeLists.txt
+@@ -14,6 +14,7 @@ add_cython_target(tasks tasks.pyx CXX PY3)
+ add_cython_target(scheduler scheduler.pyx CXX PY3)
+ add_cython_target(device device.pyx CXX PY3)
+ add_cython_target(device_manager device_manager.pyx CXX PY3)
++add_cython_target(mm mm.pyx CXX PY3)
+ add_cython_target(cyparray_state cyparray_state.pyx CXX PY3)
+ add_cython_target(cyparray cyparray.pyx CXX PY3)
+ add_cython_target(variants variants.pyx CXX PY3)
+@@ -23,6 +24,7 @@ add_library(tasks MODULE ${tasks})
+ add_library(scheduler MODULE ${scheduler})
+ add_library(device MODULE ${device})
+ add_library(device_manager MODULE ${device_manager})
++add_library(mm MODULE ${mm})
+ add_library(cyparray_state MODULE ${cyparray_state})
+ add_library(cyparray MODULE ${cyparray})
+ add_library(variants MODULE ${variants})
+@@ -37,6 +39,7 @@ install(TARGETS tasks LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/p
+ install(TARGETS scheduler LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
+ install(TARGETS device LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
+ install(TARGETS device_manager LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
++install(TARGETS mm LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
+ install(TARGETS cyparray_state LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
+ install(TARGETS cyparray LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
+ install(TARGETS variants LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
+@@ -89,6 +92,14 @@ target_include_directories(device_manager PUBLIC ${NumPy_INCLUDE_DIRS})
+ target_include_directories(device_manager PUBLIC ${PYTHON_INCLUDE_DIRS})
+ target_include_directories(device_manager PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+ 
++target_link_libraries(mm ${PYTHON_LIBRARIES})
++target_link_libraries(mm backend)
++target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend)
++target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend/include)
++target_include_directories(mm PUBLIC ${NumPy_INCLUDE_DIRS})
++target_include_directories(mm PUBLIC ${PYTHON_INCLUDE_DIRS})
++target_include_directories(mm PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
++
+ target_link_libraries(cyparray_state ${PYTHON_LIBRARIES})
+ target_link_libraries(cyparray_state backend)
+ target_include_directories(cyparray_state PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend)
+@@ -119,6 +130,7 @@ python_extension_module(tasks)
+ python_extension_module(scheduler)
+ python_extension_module(device)
+ python_extension_module(device_manager)
++python_extension_module(mm)
+ python_extension_module(cyparray_state)
+ python_extension_module(cyparray)
+ python_extension_module(variants)
+diff --git a/src/python/parla/cython/core.pxd b/src/python/parla/cython/core.pxd
+index 9643d9f..76a3f75 100644
+--- a/src/python/parla/cython/core.pxd
++++ b/src/python/parla/cython/core.pxd
+@@ -4,6 +4,7 @@ cimport cython
+ from parla.cython.device_manager cimport DeviceManager
+ from parla.cython.device cimport Device, CyDevice
+ from parla.cython.cyparray cimport InnerPArray
++from parla.cython.mm cimport LRUGlobalEvictionManager
+ 
+ from libc.stdint cimport uint32_t, uint64_t, int64_t
+ from libcpp  cimport bool
+@@ -22,6 +23,7 @@ cdef extern from "include/gpu_utility.hpp" nogil:
+     void gpu_busy_sleep(const int device, const unsigned long cycles,
+                     uintptr_t stream_ptr)
+ 
++
+ cdef extern from "include/runtime.hpp" nogil:
+     ctypedef void (*launchfunc_t)(void* py_scheduler, void* py_task, void* py_worker)
+     ctypedef void (*stopfunc_t)(void*)
+@@ -130,15 +132,19 @@ cdef extern from "include/runtime.hpp" nogil:
+ 
+         bool should_run
+         
+-        InnerScheduler(DeviceManager* cpp_device_manager)
++        InnerScheduler(LRUGlobalEvictionManager* cpp_memory_manager,  DeviceManager* cpp_device_manager)
+ 
+         void set_num_workers(int num_workers)
+         void set_py_scheduler(void* py_scheduler)
+         void set_stop_callback(stopfunc_t func)
+ 
++        bool get_should_run()
++
+         void run() except +
+         void stop()
+ 
++        long long int get_memory_size_to_evict(int dev_id) except +
++
+         void activate_wrapper()
+ 
+         void spawn_task(InnerTask* task)
+@@ -149,7 +155,6 @@ cdef extern from "include/runtime.hpp" nogil:
+         void task_cleanup_presync(InnerWorker* worker, InnerTask* task, int state) except +
+         void task_cleanup_postsync(InnerWorker* worker, InnerTask* task, int state) except +
+ 
+-        int get_num_active_tasks()
+         void increase_num_active_tasks()
+         void decrease_num_active_tasks()
+ 
+@@ -168,7 +173,10 @@ cdef extern from "include/runtime.hpp" nogil:
+         void spawn_wait()
+ 
+         void create_parray(InnerPArray* parray, int parray_dev_id)
++        void remove_parray_from_tracker(InnerPArray* parray, int dev_id)
+         
++        void invoke_all_cparrays_clear()
++        bool get_all_pyparrays_clear_flag()
+ 
+ 
+ cdef extern from "include/profiling.hpp" nogil:
+diff --git a/src/python/parla/cython/core.pyx b/src/python/parla/cython/core.pyx
+index d2ac9f9..68891c5 100644
+--- a/src/python/parla/cython/core.pyx
++++ b/src/python/parla/cython/core.pyx
+@@ -12,6 +12,7 @@ from parla.common.globals import AccessMode
+ from parla.cython.device cimport Device
+ from parla.cython.cyparray cimport CyPArray
+ from parla.cython.device_manager cimport CyDeviceManager, DeviceManager
++from parla.cython.mm cimport CyMM
+ import threading
+ from enum import IntEnum, auto
+ from parla.common.globals import cupy
+@@ -585,14 +586,15 @@ cdef class PyInnerWorker:
+ cdef class PyInnerScheduler:
+     cdef InnerScheduler* inner_scheduler
+ 
+-    def __cinit__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
++    def __cinit__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
+         cdef InnerScheduler* _inner_scheduler
+         cdef DeviceManager* _cpp_device_manager = <DeviceManager*> cy_device_manager.get_cpp_device_manager()
++        cdef LRUGlobalEvictionManager* _cpp_memory_manager = <LRUGlobalEvictionManager*> cy_memory_manager.get_cpp_memory_manager()
+ 
+-        _inner_scheduler = new InnerScheduler(_cpp_device_manager)
++        _inner_scheduler = new InnerScheduler(_cpp_memory_manager, _cpp_device_manager)
+         self.inner_scheduler = _inner_scheduler
+ 
+-    def __init__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
++    def __init__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
+         cdef InnerScheduler* _inner_scheduler
+         _inner_scheduler = self.inner_scheduler
+ 
+@@ -610,6 +612,14 @@ cdef class PyInnerScheduler:
+     def __dealloc__(self):
+         del self.inner_scheduler
+ 
++    cpdef get_should_run(self):
++        """
++        This function checks whether there are remaining tasks
++        in C scheduler queues.
++        """
++        cdef InnerScheduler* c_self = self.inner_scheduler
++        return c_self.get_should_run()
++
+     cpdef run(self):
+         cdef InnerScheduler* c_self = self.inner_scheduler
+         with nogil:
+@@ -661,10 +671,6 @@ cdef class PyInnerScheduler:
+         with nogil:
+             c_self.task_cleanup_postsync(c_worker, c_task, state)
+ 
+-    cpdef get_num_active_tasks(self):
+-        cdef InnerScheduler* c_self = self.inner_scheduler
+-        return c_self.get_num_active_tasks()
+-
+     cpdef increase_num_active_tasks(self):
+         cdef InnerScheduler* c_self = self.inner_scheduler
+         c_self.increase_num_active_tasks()
+@@ -720,6 +726,18 @@ cdef class PyInnerScheduler:
+         cdef InnerScheduler* c_self = self.inner_scheduler
+         return c_self.get_reserved_memory(global_dev_id)
+ 
++    cpdef get_memory_size_to_evict(self, int global_dev_id):
++        cdef InnerScheduler* c_self = self.inner_scheduler
++        return c_self.get_memory_size_to_evict(global_dev_id)
++
++    cpdef invoke_all_cparrays_clear(self):
++        cdef InnerScheduler* c_self = self.inner_scheduler
++        c_self.invoke_all_cparrays_clear()
++
++    cpdef get_all_pyparrays_clear_flag(self):
++        cdef InnerScheduler* c_self = self.inner_scheduler
++        return c_self.get_all_pyparrays_clear_flag()
++
+ 
+ class Resources:
+ 
+diff --git a/src/python/parla/cython/cyparray.pxd b/src/python/parla/cython/cyparray.pxd
+index 73223bc..06e7422 100644
+--- a/src/python/parla/cython/cyparray.pxd
++++ b/src/python/parla/cython/cyparray.pxd
+@@ -12,12 +12,12 @@ cdef extern from "include/parray.hpp" namespace "parray":
+     cdef cppclass InnerPArray:
+         InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *i, uint32_t) except +
+         void set_size(uint64_t)
+-        uint64_t get_num_active_tasks(uint32_t global_dev_id) except +
++        uint64_t get_num_referring_tasks(uint32_t global_dev_id) except +
+         const uint64_t get_parent_id() except +
+ 
+ cdef class CyPArray:
+     # Hold a C++ instance which we're wrapping
+     cdef InnerPArray* cpp_parray
+     cdef InnerPArray* get_cpp_parray(self)
+-    cpdef get_num_active_tasks(self, int global_dev_id)
++    cpdef get_num_referring_tasks(self, int global_dev_id)
+     cpdef get_parray_parentid(self)
+diff --git a/src/python/parla/cython/cyparray.pyx b/src/python/parla/cython/cyparray.pyx
+index 19e9fee..ad13d6a 100644
+--- a/src/python/parla/cython/cyparray.pyx
++++ b/src/python/parla/cython/cyparray.pyx
+@@ -37,8 +37,8 @@ cdef class CyPArray:
+     cdef InnerPArray* get_cpp_parray(self):
+         return self.cpp_parray
+ 
+-    cpdef get_num_active_tasks(self, int global_dev_id):
+-        return self.cpp_parray.get_num_active_tasks(global_dev_id)
++    cpdef get_num_referring_tasks(self, int global_dev_id):
++        return self.cpp_parray.get_num_referring_tasks(global_dev_id)
+ 
+     cpdef get_parray_parentid(self):
+         return self.cpp_parray.get_parent_id()
+diff --git a/src/python/parla/cython/mm.pxd b/src/python/parla/cython/mm.pxd
+new file mode 100644
+index 0000000..8a8670a
+--- /dev/null
++++ b/src/python/parla/cython/mm.pxd
+@@ -0,0 +1,13 @@
++from parla.cython.device_manager cimport DeviceManager
++
++cdef extern from "include/memory_manager.hpp" nogil:
++    cdef cppclass LRUGlobalEvictionManager:
++        LRUGlobalEvictionManager(DeviceManager *)
++        unsigned long long size(unsigned int device_id)
++        void *remove_and_return_head_from_zrlist(unsigned int device_id)
++
++cdef class CyMM:
++    cdef LRUGlobalEvictionManager* _inner_mm
++    cpdef size(self, int dev_id)
++    cpdef remove_and_return_head_from_zrlist(self, int dev_id)
++    cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self)
+diff --git a/src/python/parla/cython/mm.pyx b/src/python/parla/cython/mm.pyx
+new file mode 100644
+index 0000000..268e7cf
+--- /dev/null
++++ b/src/python/parla/cython/mm.pyx
+@@ -0,0 +1,62 @@
++
++from parla.cython import device_manager
++
++from parla.cython.core cimport LRUGlobalEvictionManager
++from parla.cython cimport device_manager
++#from parla.cython.core import LRUGlobalEvictionManager
++
++class PyMM:
++    def __init__(self, dm: device_manager.PyDeviceManager):
++        self._device_manager = device_manager
++        self._cy_mm = CyMM(dm.get_cy_device_manager())
++
++    def size(self, dev_id: int):
++        return self._cy_mm.size(dev_id)
++
++    def remove_and_return_head_from_zrlist(self, dev_id: int):
++        return self._cy_mm.remove_and_return_head_from_zrlist(dev_id)
++
++    def get_cy_memory_manager(self):
++        return self._cy_mm
++
++    def print_memory_stats(self, device_id, label: str):
++        import psutil
++        import os
++        print(f"[{label}] Memory tracking", flush=True)
++        try:
++            import cupy
++            mempool = cupy.get_default_memory_pool()
++            pinned_mempool = cupy.get_default_pinned_memory_pool()
++            print((
++                  f"\t GPU{device_id} {label} CuPy used bytes: {mempool.used_bytes()} \n"
++                  f"\t GPU{device_id} {label} Free bytes: {mempool.free_bytes()} \n"
++                  f"\t GPU{device_id} {label} Total bytes: {mempool.total_bytes()} \n"), flush=True)
++        except ImportError:
++            print("MM tracker only supports CuPy memory status checking.", flush=True)
++
++
++cdef class CyMM:
++
++    def __cinit__(self, device_manager.CyDeviceManager cy_dm):
++        self._inner_mm = new LRUGlobalEvictionManager(cy_dm.get_cpp_device_manager())
++
++    def __dealloc__(self):
++        del self._inner_mm
++
++    cpdef size(self, int dev_id):
++        cdef LRUGlobalEvictionManager* c_self = self._inner_mm
++        return c_self.size(dev_id)
++
++    cpdef remove_and_return_head_from_zrlist(self, int dev_id):
++        cdef LRUGlobalEvictionManager* c_self = self._inner_mm
++        cdef void* py_parray = c_self.remove_and_return_head_from_zrlist(dev_id)
++        if py_parray == NULL:
++            # TODO(hc): This path is actually not used.
++            # It would be great if we can check if this python object is valid
++            # at here; it can simplify our current mechanism a lot.
++            return None
++        else:
++            return <object> py_parray
++
++    cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self):
++        return self._inner_mm
+diff --git a/src/python/parla/cython/scheduler.pyx b/src/python/parla/cython/scheduler.pyx
+index 3390240..db2caae 100644
+--- a/src/python/parla/cython/scheduler.pyx
++++ b/src/python/parla/cython/scheduler.pyx
+@@ -21,6 +21,7 @@ from parla.cython import tasks
+ cimport core
+ from parla.cython import core
+ from parla.cython.cyparray import CyPArray
++from parla.cython.mm import PyMM
+ 
+ from parla.common.globals import _Locals as Locals 
+ from parla.common.globals import USE_PYTHON_RUNAHEAD, _global_data_tasks, PREINIT_THREADS
+@@ -231,11 +232,11 @@ class WorkerThread(ControllableThread, SchedulerContext):
+                         #print("Setting environment for task", active_task, flush=True)
+                         active_task.environment = device_context
+ 
+-
+                         #Writes all 'default' streams and event pointers to c++ task
+                         #This allows their synchronization without the GIL and faster iteration over them
+                         #(only saves initial runtime ones, TODO(wlr): save any user added events or streams after body returns)
+                         device_context.write_to_task(active_task)
++
+                         #print("Wrote enviornment to task", active_task, flush=True)
+ 
+                         #handle event wait in python 
+@@ -294,6 +295,7 @@ class WorkerThread(ControllableThread, SchedulerContext):
+                         elif  isinstance(final_state, tasks.TaskRunahead):
+                             core.binlog_2("Worker", "Runahead task: ", active_task.inner_task, " on worker: ", self.inner_worker)
+                     
++                        #TODO(wlr): Add better exception handling
+                         #print("Cleaning up Task", active_task, flush=True)
+                         
+                         if USE_PYTHON_RUNAHEAD:
+@@ -314,13 +316,13 @@ class WorkerThread(ControllableThread, SchedulerContext):
+                         if isinstance(final_state, tasks.TaskRunahead):
+                             final_state = tasks.TaskCompleted(final_state.return_value)
+                             active_task.cleanup()
+-
+                             core.binlog_2("Worker", "Completed task: ", active_task.inner_task, " on worker: ", self.inner_worker)
+ 
+                         # print("Finished Task", active_task, flush=True)
+                         active_task.state = final_state
+                         self.task = None
+ 
++                        self.task = None
+                         nvtx.pop_range(domain="Python Runtime")
+                     elif self._should_run:
+                         raise WorkerThreadException("%r Worker: Woke without a task", self.index)
+@@ -350,7 +352,7 @@ class WorkerThread(ControllableThread, SchedulerContext):
+ 
+ class Scheduler(ControllableThread, SchedulerContext):
+ 
+-    def __init__(self, device_manager, n_threads=6, period=0.001):
++    def __init__(self, memory_manager, device_manager, n_threads=6, period=0.001):
+         super().__init__()
+ 
+         self.start_monitor = threading.Condition(threading.Lock())
+@@ -364,9 +366,28 @@ class Scheduler(ControllableThread, SchedulerContext):
+         #TODO: Handle resources better
+         resources = 1.0
+ 
++        self.memory_manager = memory_manager
+         self.device_manager = device_manager
++        cy_memory_manager = self.memory_manager.get_cy_memory_manager()
+         cy_device_manager = self.device_manager.get_cy_device_manager()
+-        self.inner_scheduler = PyInnerScheduler(cy_device_manager, n_threads, resources, self)
++        self.inner_scheduler = PyInnerScheduler(cy_memory_manager,
++                                                cy_device_manager,
++                                                n_threads,
++                                                resources, self)
++        # This holds PArray references.
++        # Through this, it makes a scheduler control a PArray's life cycle.
++        # For example, this holds the last reference of a PArray and so,
++        # scheduler (or memory manager) can have the control of thats
++        # deallocation.
++        # TODO(hc): However, for now, we only support reset this dictionary
++        #           not deallocating a single PArray.
++        #           For now, instead we deallocate PArray instance by
++        #           removing the internal array reference through evict().
++        #           In the future, we will have better design for this.
++        self.active_parrays = {}
++        # Worker threads and a scheduler both can access the active_parrays
++        # and so we need a lock to guard that.
++        self.active_parrays_monitor = threading.Condition(threading.Lock())
+ 
+         self.worker_threads = [WorkerThread(self, i) for i in range(n_threads)]
+ 
+@@ -382,6 +403,58 @@ class Scheduler(ControllableThread, SchedulerContext):
+     def scheduler(self):
+         return self
+ 
++    def append_active_parray(self, parray: PArray):
++        """ Append a PArray reference.
++
++            :param parray: PArray to be appended
++        """
++        with self.active_parrays_monitor:
++            self.active_parrays[parray.ID] = parray
++
++    def remove_active_parray(self, parray: PArray):
++        """ Remove a PArray reference.
++
++            :param parray: PArray to be removed 
++        """
++        with self.active_parrays_monitor:
++            self.active_parrays[parray.ID] = None
++
++    def clear_active_parrays(self):
++        """ Clear all references from active_parrays dictionary.
++        """
++        with self.active_parrays_monitor:
++            import cupy
++            for k in range(0, 4):
++                with cupy.cuda.Device(k):
++                    mempool = cupy.get_default_memory_pool()
++                    print(f"\t {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True)
++            # TODO(hc): This is unstable hack.
++            # This loop is necessary since a worker thread 
++            # can proceed with the next task graph
++            # (in case of benchmark/python/benchmark.py)
++            # and generate and allocate new data while a scheduler
++            # who is the caller of this function holds its reference.
++            # It seems like there is a concurrency issue; for example,
++            # removing active_parrays' reference is not immediately
++            # caught by a gc. The safest way that I am using is to evict
++            # each of PArrays at here.
++            # But still it is unstable way and I will replace this soon.
++            for k, v in self.active_parrays.items():
++                for dev in self.device_manager.get_all_devices():
++                    global_dev_id = dev.get_global_id()
++                    parray_dev_id = self.device_manager.globalid_to_parrayid(global_dev_id)
++                    v.evict(parray_dev_id)
++            # TODO(hc): I am not sure why the above loop is necessary..
++            #           But otherwise, it doesnt deallocate cupy arrays.
++            # TODO(hc): This is not considering concurrent execution between
++            #           a thread and a scheduler..
++            #           The main thread can allocate new data while or before the scheduler
++            #           deallocates the old PArrays through this.
++            for k in range(0, 4):
++                with cupy.cuda.Device(k):
++                    mempool = cupy.get_default_memory_pool()
++                    print(f"\t After {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True)
++
+     def get_device_reqs_from_placement(self, placement, vcus, memory):
+         return self.device_manager.get_device_reqs_from_placement(placement, vcus, memory)
+ 
+@@ -414,10 +487,73 @@ class Scheduler(ControllableThread, SchedulerContext):
+             pass
+             #print("Runtime Stopped", flush=True)
+ 
++    def parray_eviction(self):
++        py_mm = self.memory_manager
++        print("Eviction policy is activated")
++        for cuda_device in self.device_manager.get_devices(DeviceType.CUDA):
++            global_id = cuda_device.get_global_id()
++            parray_id = self.device_manager.globalid_to_parrayid(global_id)
++            # Get target memory size to evict from this device
++            memory_size_to_evict = \
++                self.inner_scheduler.get_memory_size_to_evict(global_id)
++            # Get the number of PArray candidates that are allowed to be evicted
++            # from Python eviction manager.
++            num_evictable_parray = py_mm.size(global_id)
++            # TODO(hc): remove this. this is for test.
++            import cupy
++            for i in range(0, num_evictable_parray):
++                try:
++                    # Get a PArray from a memory manager to evict.
++                    evictable_parray = \
++                        py_mm.remove_and_return_head_from_zrlist(global_id)
++                    if evictable_parray is not None:
++                        # TODO(hc): remove this. this is for test.
++                        for k in range(0, 4):
++                            with cupy.cuda.Device(k):
++                                mempool = cupy.get_default_memory_pool()
++                                print(f"\t OK? {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) 
++
++                        evictable_parray.evict(parray_id)
++
++                        # TODO(hc): remove this. this is for test.
++                        for k in range(0, 4):
++                            with cupy.cuda.Device(k):
++                                mempool = cupy.get_default_memory_pool()
++                                print(f"\t OK {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) 
++
++                        # Repeat eviction until it gets enough memory.
++                        memory_size_to_evict -= \
++                            evictable_parray.nbytes_at(parray_id)
++                        if memory_size_to_evict <= 0:
++                            break
++                except Exception as e:
++                    print("Failed to find parray evictable", flush=True)
++        return
++
+     def run(self):
+-        #print("Scheduler: Running", flush=True)
+-        self.inner_scheduler.run()
+-        #print("Scheduler: Stopped Loop", flush=True)
++        with self:
++            while True:
++                print("Scheduler: Running", flush=True)
++                self.inner_scheduler.run()
++                if self.inner_scheduler.get_all_pyparrays_clear_flag():
++                    # All the references of the PArrays held by
++                    # a Python scheduler should be destroyed 
++                    # AFTER C++ scheduler (or memory manager) clears
++                    # all PArray nodes from the list in the eviction manager.
++                    # TODO(hc): rename the function to is_cparrays_cleared.
++                    # TODO(hc): this might be done by a worker thread
++                    #           who will allocate PArrays.
++                    #           Otherwise, there should be thread concurrent
++                    #           over memory allocation/deallocation.
++                    self.clear_active_parrays()
++                else:
++                    should_run = self.inner_scheduler.get_should_run()
++                    if should_run == False:
++                        break
++                    # This case is executed if PArray eviction
++                    # mechanism was invoked by C++ scheduler.
++                    self.parray_eviction() 
++            self.stop_callback()
+ 
+     def stop(self):
+         #print("Scheduler: Stopping (Called from Python)", flush=True)
+@@ -437,7 +573,6 @@ class Scheduler(ControllableThread, SchedulerContext):
+     def spawn_task(self, task):
+         #print("Scheduler: Spawning Task", task, flush=True)
+         self.inner_scheduler.spawn_task(task.inner_task)
+-
+     
+     def assign_task(self, task, worker):
+         task.state = tasks.TaskRunning(task.func, task.args, task.dependencies)
+@@ -449,7 +584,6 @@ class Scheduler(ControllableThread, SchedulerContext):
+     def spawn_wait(self):
+         self.inner_scheduler.spawn_wait()
+         
+-
+     def create_parray(self, cy_parray: CyPArray, parray_dev_id: int):
+         """
+         Reserve PArray instances that are created through
+@@ -514,6 +648,20 @@ class Scheduler(ControllableThread, SchedulerContext):
+         return self.inner_scheduler.get_reserved_parray_state( \
+             global_dev_id, parray_parent_id)
+ 
++    def remove_parray_from_tracker(\
++        self, cy_parray: CyPArray, did: int):
++        """
++        Remove the evicted PArray instance on device `global_dev_id`
++        from the PArray tracker's table
++
++        :param cy_parray: Cython PArray instance to be removed 
++        :param did: global logical device id where the PArray is evicted
++        """
++        self.inner_scheduler.remove_parray_from_tracker(cy_parray, did)
++
++    def invoke_all_cparrays_clear(self):
++        self.inner_scheduler.invoke_all_cparrays_clear()
++
+ 
+ def _task_callback(task, body):
+     """
+diff --git a/src/python/parla/cython/tasks.pyx b/src/python/parla/cython/tasks.pyx
+index 98bd759..bd71e44 100644
+--- a/src/python/parla/cython/tasks.pyx
++++ b/src/python/parla/cython/tasks.pyx
+@@ -21,6 +21,8 @@ from parla.common.globals import AccessMode, Storage
+ from parla.cython.cyparray import CyPArray
+ from parla.common.parray.core import PArray
+ from parla.common.globals import SynchronizationType as SyncType 
++from parla.common.globals import _global_data_tasks
++
+ 
+ PyDevice = device.PyDevice
+ PyCUDADevice = device.PyCUDADevice
+@@ -554,18 +556,25 @@ class Task:
+                 cy_parray = in_parray.cy_parray
+                 self.inner_task.add_parray(cy_parray,
+                     AccessMode.IN, in_parray_devid)
++                # Add a PArray reference to a dictionary in a scheduler
++                # to pass its lifecycle.
++                self.scheduler.append_active_parray(in_parray)
+             for out_parray_tpl in dataflow.output:
+                 out_parray = out_parray_tpl[0]
+                 out_parray_devid = out_parray_tpl[1]
+                 cy_parray = out_parray.cy_parray
+                 self.inner_task.add_parray(cy_parray,
+                     AccessMode.OUT, out_parray_devid)
++                self.scheduler.append_active_parray(out_parray)
+             for inout_parray_tpl in dataflow.inout:
+                 inout_parray = inout_parray_tpl[0]
+                 inout_parray_devid = inout_parray_tpl[1]
+                 cy_parray = inout_parray.cy_parray
+                 self.inner_task.add_parray(cy_parray,
+                     AccessMode.INOUT, inout_parray_devid)
++                # TODO(hc): Maybe we can pass dataflow to reduce
++                #           lock conflicts.
++                self.scheduler.append_active_parray(inout_parray)
+ 
+     def notify_dependents_wrapper(self):
+         """!
+@@ -713,6 +722,7 @@ class DataMovementTask(Task):
+         idx=0, state=TaskCreated(), scheduler=None, name=None):
+         super().__init__(taskspace, idx, state, scheduler, name)
+         self.parray = parray
++
+         self.access_mode = access_mode
+         self.assigned_devices = assigned_devices
+ 
+@@ -748,7 +758,6 @@ class DataMovementTask(Task):
+             self.parray._auto_move(device_manager.get_parray_id(global_device_id),
+                                    write_flag)
+         """
+-#self.parray._auto_move(device_manager.get_parray_id(self.dev_id), write_flag)
+         target_dev = self.assigned_devices[0]
+         global_id = target_dev.get_global_id()
+         parray_id = device_manager.globalid_to_parrayid(global_id)
+@@ -758,7 +767,8 @@ class DataMovementTask(Task):
+         return TaskRunahead(0)
+ 
+     def cleanup(self):
+-        pass
++        _global_data_tasks[id(self)] = None
++        self.parray = None
+ 
+ ######
+ # Task Environment
+@@ -1828,9 +1838,3 @@ class BackendTaskSpace(TaskSpace):
+ 
+     def wait(self):
+         self.inner_space.wait()
+-
+-
+-
+-
+-    
+-    
+diff --git a/src/python/parla/utility/execute.py b/src/python/parla/utility/execute.py
+index 36825aa..eadcaaf 100644
+--- a/src/python/parla/utility/execute.py
++++ b/src/python/parla/utility/execute.py
+@@ -261,15 +261,16 @@ def synthetic_kernel_gpu(total_time: int, gil_fraction: Union[Fraction, float],
+     return None
+ 
+ 
+-def create_task_no_data(task, taskspaces, config, data_list=None):
++def create_task_no_data(task, taskspaces, config, ts_postfix=None, data_list=None):
+ 
+     try:
+         # Task ID
+         task_idx = task.task_id.task_idx
+-        taskspace = taskspaces[task.task_id.taskspace]
++        ts_postfix_str = "" if ts_postfix is None else ts_postfix
++        taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str]
+ 
+         # Dependency Info
+-        dependencies = [taskspaces[dep.taskspace][dep.task_idx]
++        dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx]
+                         for dep in task.task_dependencies]
+ 
+         # Valid Placement Set
+@@ -326,15 +327,16 @@ def create_task_no_data(task, taskspaces, config, data_list=None):
+         return
+ 
+ 
+-def create_task_eager_data(task, taskspaces, config=None, data_list=None):
++def create_task_eager_data(task, taskspaces, config=None, ts_postfix=None, data_list=None):
+ 
+     try:
+         # Task ID
+         task_idx = task.task_id.task_idx
+-        taskspace = taskspaces[task.task_id.taskspace]
++        ts_postfix_str = "" if ts_postfix is None else ts_postfix
++        taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str]
+ 
+         # Dependency Info
+-        dependencies = [taskspaces[dep.taskspace][dep.task_idx]
++        dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx]
+                         for dep in task.task_dependencies]
+ 
+         # Valid Placement Set
+@@ -397,14 +399,15 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None):
+         if config.gil_fraction is not None:
+             gil_fraction = config.gil_fraction
+ 
+-        # print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True)
+         """
++        print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True)
+         print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction,
+             " placement:", placement_set)
+-        """
+-
+         # TODO(hc): Add data checking.
+-        @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT)
++        print("Memory:", IN[0][0].nbytes)
++        """
++        # TODO(hc): remove memory operand
++        @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT, memory=IN[0][0].nbytes)
+         async def task_func():
+             if config.verbose:
+                 print(f"+{task.task_id} Running", flush=True)
+@@ -421,15 +424,16 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None):
+         return
+ 
+ 
+-def create_task_lazy_data(task, taskspaces, config=None, data_list=None):
++def create_task_lazy_data(task, taskspaces, config=None, ts_postfix=None, data_list=None):
+ 
+     try:
+         # Task ID
+         task_idx = task.task_id.task_idx
+-        taskspace = taskspaces[task.task_id.taskspace]
++        ts_postfix_str = "" if ts_postfix is None else ts_postfix
++        taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str]
+ 
+         # Dependency Info
+-        dependencies = [taskspaces[dep.taskspace][dep.task_idx]
++        dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx]
+                         for dep in task.task_dependencies]
+ 
+         # Valid Placement Set
+@@ -483,8 +487,11 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None):
+ 
+         if config.gil_fraction is not None:
+             gil_fraction = config.gil_fraction
++        """
+         print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction,
+               " placement:", placement_set)
++        
++        """
+ 
+         @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set])
+         async def task_func():
+@@ -520,7 +527,7 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None):
+         return
+ 
+ 
+-def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, data_list=None):
++def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, ts_postfix=None, data_list=None):
+ 
+     spawn_start_t = time.perf_counter()
+ 
+@@ -530,32 +537,36 @@ def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConf
+         if run_config.movement_type == MovementType.NO_MOVEMENT:
+             # print("No data movement")
+             create_task_no_data(details, taskspaces,
+-                                config=run_config, data_list=data_list)
++                                config=run_config,
++                                ts_postfix=ts_postfix,
++                                data_list=data_list)
+         elif run_config.movement_type == MovementType.EAGER_MOVEMENT:
+             # print("Eager data movement")
+             create_task_eager_data(details, taskspaces,
+-                                   config=run_config, data_list=data_list)
++                                   config=run_config,
++                                   ts_postfix=ts_postfix,
++                                   data_list=data_list)
+         elif run_config.movement_type == MovementType.LAZY_MOVEMENT:
+             # print("Lazy data movement")
+             create_task_lazy_data(details, taskspaces,
+-                                  config=run_config, data_list=data_list)
+-
++                                  config=run_config,
++                                  ts_postfix=ts_postfix,
++                                  data_list=data_list)
+     spawn_end_t = time.perf_counter()
+ 
+     return taskspaces
+ 
+ 
+ def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]):
+-
+     @spawn(vcus=0)
+     async def main_task():
+ 
+         graph_times = []
++        # Generate data once for multiple iterations.
++        data_list = generate_data(
++            data_config, run_config.data_scale, run_config.movement_type)
+ 
+         for i in range(run_config.inner_iterations):
+-            data_list = generate_data(
+-                data_config, run_config.data_scale, run_config.movement_type)
+-
+             # Initialize task spaces
+             taskspaces = {}
+ 
+@@ -570,10 +581,76 @@ def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo
+ 
+             for taskspace in taskspaces.values():
+                 await taskspace
++            graph_end_t = time.perf_counter()
++
++            #worker_thread = get_scheduler_context()
++            #worker_thread.scheduler.invoke_all_cparrays_clear()
++
++            graph_elapsed = graph_end_t - graph_start_t
++            graph_times.append(graph_elapsed)
++
++        graph_times = np.asarray(graph_times)
++        graph_t = TimeSample(np.mean(graph_times), np.median(graph_times), np.std(
++            graph_times), np.min(graph_times), np.max(graph_times), len(graph_times))
++
++        timing.append(graph_t)
++
++
++def execute_graph_memory2(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]):
++    """
++    This function creates data for each iteration and intentionally decreases
++    its reference count.
++    Data generated in the previous iteration becomes unnecessary and a
++    scheduler's eviction manager evicts those data to CPU.
++    This function requires the following consideration;
++    data should be less than CPU memory. So, the number of iterations or
++    PArray size should not be high.
++    TODO(hc): it might be a separate test.
++    """
++    @spawn(vcus=0)
++    async def main_task():
++
++        graph_times = []
++        data_list = []
++        max_memory = 0
++        for i in range(0, run_config.inner_iterations):
++            import cupy
++            # Initialize task spaces
++            taskspaces = {}
++
++            data_list.append(generate_data(data_config, run_config.data_scale, run_config.movement_type))
++            ts_postfix = "-"+str(i)
++            # Create a task space with the postfix of the current iteration.
++            for task, details in tasks.items():
++                space_name = details.task_id.taskspace
++                space_name += ts_postfix
++                if space_name not in taskspaces:
++                    taskspaces[space_name] = TaskSpace(space_name)
+ 
++            for k in range(0, 4):
++                with cupy.cuda.Device(k):
++                    mempool = cupy.get_default_memory_pool()
++                    print(f"\t Right Before {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") 
++
++            graph_start_t = time.perf_counter()
++
++            execute_tasks(taskspaces, tasks, run_config, ts_postfix, data_list=data_list[i])
++
++            for taskspace in taskspaces.values():
++                await taskspace
++            taskspace = None
+             graph_end_t = time.perf_counter()
+ 
++            #worker_thread = get_scheduler_context()
++            #worker_thread.scheduler.invoke_all_cparrays_clear()
++
++            for k in range(0, 4):
++                with cupy.cuda.Device(k):
++                    mempool = cupy.get_default_memory_pool()
++                    print(f"\t Right After {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") 
++
+             graph_elapsed = graph_end_t - graph_start_t
++            print("Iteration:", i, ", execution time:", graph_elapsed, flush=True)
+             graph_times.append(graph_elapsed)
+ 
+         graph_times = np.asarray(graph_times)
+@@ -597,7 +674,7 @@ def run(tasks: Dict[TaskID, TaskInfo], data_config: Dict[int, DataInfo] = None,
+ 
+         with Parla(logfile=run_config.logfile):
+             internal_start_t = time.perf_counter()
+-            execute_graph(data_config, tasks, run_config, timing)
++            execute_graph_memory2(data_config, tasks, run_config, timing)
+             internal_end_t = time.perf_counter()
+ 
+         outer_end_t = time.perf_counter()
+@@ -797,12 +874,14 @@ class GraphContext(object):
+         return self
+ 
+     def run(self, run_config: RunConfig, max_time: int = 100):
+-
++        return run(self.graph, self.data_config, run_config)
++        """
+         @timeout(max_time)
+         def run_with_timeout():
+             return run(self.graph, self.data_config, run_config)
+ 
+         return run_with_timeout()
++        """
+ 
+     def __exit__(self, type, value, traceback):
+         self.diro.__exit__(type, value, traceback)
diff --git a/src/python/parla/cython/tasks.pyx b/src/python/parla/cython/tasks.pyx
index 80be4101..1d8ee49b 100644
--- a/src/python/parla/cython/tasks.pyx
+++ b/src/python/parla/cython/tasks.pyx
@@ -721,6 +721,7 @@ class DataMovementTask(Task):
         global_id = target_dev.get_global_id()
         parray_id = device_manager.globalid_to_parrayid(global_id)
 
+        print(self.name, " moves parray:", parray_id, " on device:", target_dev)
         self.parray._auto_move(parray_id, write_flag)
         return TaskRunahead(0)
 
@@ -1787,9 +1788,3 @@ class BackendTaskSpace(TaskSpace):
 
     def wait(self):
         self.inner_space.wait()
-
-
-
-
-    
-    

From 22d80669c1f690542f702974a5cad2dd104a4761 Mon Sep 17 00:00:00 2001
From: Hochan Lee <hochan@utexas.edu>
Date: Sun, 14 Jan 2024 16:13:50 -0600
Subject: [PATCH 2/5] addcomments

---
 src/c/backend/include/runtime.hpp |  9 ++++++++-
 src/c/backend/phases.cpp          | 22 ++++++++++------------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp
index 88e06018..d4ab7214 100644
--- a/src/c/backend/include/runtime.hpp
+++ b/src/c/backend/include/runtime.hpp
@@ -277,7 +277,7 @@ class InnerTask {
   /* Task Assigned Device Set*/
   std::vector<Device *> assigned_devices;
 
-  /*Resource Requirements for each assigned device*/
+  /* Resource Requirements for each assigned device*/
   std::unordered_map<int, ResourcePool_t> device_constraints;
 
   /* Task is data movement task */
@@ -293,6 +293,13 @@ class InnerTask {
   std::vector<std::vector<std::pair<parray::InnerPArray *, AccessMode>>>
       parray_list;
 
+  /* A list of dependency tasks of a parray for this task's dependent tasks.
+     To be specific, a task sets dependencies of a parray for dependent tasks.
+     If this task's access permission to a parray includes write, it sets
+     itself as the dependency of the parray.
+     If this task's access permission to the parray is read-only, it pulls
+     this list of the dependencies to this map.
+   */
   std::unordered_map<uint64_t, std::vector<InnerTask*>> parray_dependencies_map;
 
   InnerTask();
diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp
index 2713e21f..ac48bca8 100644
--- a/src/c/backend/phases.cpp
+++ b/src/c/backend/phases.cpp
@@ -239,7 +239,8 @@ void MemoryReserver::create_datamove_tasks(InnerTask *task) {
 
 
 
-
+// TODO(hc): need to think about better naming before it is merged.
+// first, need peer-review on this.
 void MemoryReserver::create_datamove_tasks2(InnerTask *task) {
   // Get a list of the parrays the current task holds.
   const std::vector<std::vector<std::pair<parray::InnerPArray *, AccessMode>>>
@@ -258,28 +259,25 @@ void MemoryReserver::create_datamove_tasks2(InnerTask *task) {
           task_base_name + ".dm." + std::to_string(i), 0, parray, access_mode,
           i);
       uint64_t parray_parent_id = parray->get_parent_parray()->id;
-      // Find dependency intersection between compute and data movement tasks.
-
-      // TODO(hc): This is not the complete implementation.
-      //           We will use a concurrent map for parray's
-      //           task list as an optimization.
-
+      // Get dependencies
       std::vector<void *> compute_task_dependencies = task->get_dependencies();
       std::vector<InnerTask *> data_task_dependencies;
       for (size_t k = 0; k < compute_task_dependencies.size(); ++k) {
         InnerTask *parray_dependency =
             static_cast<InnerTask *>(compute_task_dependencies[k]);
+        // Get dependencies of a parray having `parray_parent_id` that have
+        // registered to the traversed dependency task
         std::vector<InnerTask*>& dep_parray_dependencies = 
             parray_dependency->get_parray_dependencies(parray_parent_id);
 
-        std::cout << parray_dependency->name << " is being traversed\n";
+        //std::cout << parray_dependency->name << " is being traversed\n";
         for (size_t t = 0; t < dep_parray_dependencies.size(); ++t) {
           data_task_dependencies.push_back(parray_dependency);
           // If the current processing parray's access mode is READ ONLY,
           // add this dependency as a dependency for this parray.
-          std::cout << "access mode:" << int(access_mode) << "\n";
+          //std::cout << "access mode:" << int(access_mode) << "\n";
           if (access_mode == AccessMode::IN) {
-            std::cout << "IN parray is added:" << parray_parent_id << "\n";
+            //std::cout << "IN parray is added:" << parray_parent_id << "\n";
             task->get_parray_dependencies(parray_parent_id).push_back(parray_dependency);
           }
         }
@@ -287,9 +285,9 @@ void MemoryReserver::create_datamove_tasks2(InnerTask *task) {
 
       // If the current processing parray's access mode is not READ ONLY,
       // add itself as a dependency for this parray.
-      std::cout << task->name << " is being traversed access id :" << int(access_mode) << "\n";
+      //std::cout << task->name << " is being traversed access id :" << int(access_mode) << "\n";
       if (access_mode != AccessMode::IN) {
-        std::cout << "IN/OUT OUT parray is added:" << parray_parent_id << "\n";
+        //std::cout << "IN/OUT OUT parray is added:" << parray_parent_id << "\n";
         task->get_parray_dependencies(parray_parent_id).push_back(task);
       }
 

From e0b22a77ee4c705ca85cbb627ef9f203416282a0 Mon Sep 17 00:00:00 2001
From: nicelhc13 <nicelhc13@users.noreply.github.com>
Date: Sun, 14 Jan 2024 22:18:16 +0000
Subject: [PATCH 3/5] style: apply ruff format

---
 src/python/parla/common/dataflow.py         |  2 +-
 src/python/parla/common/parray/coherence.py | 16 +++++++---------
 src/python/parla/common/parray/core.py      |  9 +++------
 src/python/parla/utility/graphs.py          |  6 +++---
 4 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/python/parla/common/dataflow.py b/src/python/parla/common/dataflow.py
index 8bdfcf74..254d0b9f 100644
--- a/src/python/parla/common/dataflow.py
+++ b/src/python/parla/common/dataflow.py
@@ -103,7 +103,7 @@ def process_crosspys(
                                 _out.append((parray, i))
                 else:
                     raise TypeError(
-                       f"Invalid Type: {type(element)}. Dataflow should be PArray, CrossPyArray, or Tuple[PArray, int]"
+                        f"Invalid Type: {type(element)}. Dataflow should be PArray, CrossPyArray, or Tuple[PArray, int]"
                     )
         return _out
 
diff --git a/src/python/parla/common/parray/coherence.py b/src/python/parla/common/parray/coherence.py
index 62e103a8..ba2c546d 100644
--- a/src/python/parla/common/parray/coherence.py
+++ b/src/python/parla/common/parray/coherence.py
@@ -27,14 +27,14 @@ class MemoryOperation:
 
     # Flag
     SWITCH_DEVICE_FLAG = (
-        101
-    )  # if the flag is set, it means dst is not the current device
+        101  # if the flag is set, it means dst is not the current device
+    )
     LOAD_SUBARRAY = (
-        102
-    )  # if the flag is set, it means a subarray of src should be loaded
+        102  # if the flag is set, it means a subarray of src should be loaded
+    )
     ENSURE_IS_COMPLETE = (
-        103
-    )  # if the flag is set, check data will also check if the data is complete
+        103  # if the flag is set, check data will also check if the data is complete
+    )
 
     def __init__(self, inst: int = NOOP, dst: int = -1, src: int = -1, flag: int = []):
         self.inst = inst
@@ -125,9 +125,7 @@ def __init__(self, init_owner: int, num_gpu: int, cyparray_state: CyPArrayState)
         self._is_complete[CPU_INDEX] = None
 
         self._local_states[init_owner] = self.MODIFIED  # initial state is MODIFIED
-        self.owner = (
-            init_owner
-        )  # the device that has the complete copy (take the role of main memory)
+        self.owner = init_owner  # the device that has the complete copy (take the role of main memory)
         self._versions[init_owner] = 0  # the first version is 0
         self._is_complete[init_owner] = True  # the copy is complete
         self._latest_version = 0  # the latest version in the system
diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py
index c4211086..c7dd56ed 100644
--- a/src/python/parla/common/parray/core.py
+++ b/src/python/parla/common/parray/core.py
@@ -170,13 +170,11 @@ def get_array(self, device_idx: Optional[int] = None) -> ndarray:
         """
 
         if device_idx is None:
-            device_idx = self._current_device_index 
+            device_idx = self._current_device_index
 
         if self._slices:  # so this is a sub-parray object
             # index into origin array by saved slices
-            ret = self._array.get_by_global_slices(
-                device_idx, self._slices[0]
-            )
+            ret = self._array.get_by_global_slices(device_idx, self._slices[0])
             for s in self._slices[1:]:
                 ret = ret[s]
             return ret
@@ -214,10 +212,9 @@ def _current_device_index(self) -> int:
             # to avoid import gpu context, which is slow to setup.
             return device.device.id  # device.device should be a cupy.cuda.Device object
 
-
     # Public API:
 
-    def get(self, device: Optional[PyDevice] = None) -> 'np.ndarray' | 'cp.ndarray':
+    def get(self, device: Optional[PyDevice] = None) -> "np.ndarray" | "cp.ndarray":
         if device is None:
             return self.array
         else:
diff --git a/src/python/parla/utility/graphs.py b/src/python/parla/utility/graphs.py
index 072f44f3..d9852dea 100644
--- a/src/python/parla/utility/graphs.py
+++ b/src/python/parla/utility/graphs.py
@@ -251,8 +251,8 @@ class RunConfig:
     """
 
     outer_iterations: int = (
-        1
-    )  # Number of times to launch the Parla runtime and execute the task graph
+        1  # Number of times to launch the Parla runtime and execute the task graph
+    )
     # Number of times to execute the task graph within the same Parla runtime
     inner_iterations: int = 1
     inner_sync: bool = False  # Whether to synchronize after each kernel launch
@@ -504,7 +504,7 @@ def get_task_properties(line: str):
 
 
 def parse_blog(
-    filename: str = "parla.blog"
+    filename: str = "parla.blog",
 ) -> Tuple[Dict[TaskID, TaskTime], Dict[TaskID, List[TaskID]]]:
     try:
         result = subprocess.run(

From f7da831c12f4d544653c9535f374d3669533ae72 Mon Sep 17 00:00:00 2001
From: Hochan Lee <hochan@utexas.edu>
Date: Sun, 14 Jan 2024 16:20:08 -0600
Subject: [PATCH 4/5] remove unncessary file

---
 src/dev.patch | 2498 -------------------------------------------------
 1 file changed, 2498 deletions(-)
 delete mode 100644 src/dev.patch

diff --git a/src/dev.patch b/src/dev.patch
deleted file mode 100644
index 1789036a..00000000
--- a/src/dev.patch
+++ /dev/null
@@ -1,2498 +0,0 @@
-diff --git a/src/c/backend/include/memory_manager.hpp b/src/c/backend/include/memory_manager.hpp
-new file mode 100644
-index 0000000..b1467ec
---- /dev/null
-+++ b/src/c/backend/include/memory_manager.hpp
-@@ -0,0 +1,420 @@
-+#ifndef PARLA_MEMORY_MANNGER_HPP
-+#define PARLA_MEMORY_MANNGER_HPP
-+
-+#include "device_manager.hpp"
-+#include "parray.hpp"
-+
-+
-+/**
-+ * @brief Node type of a PArray eviction double-linked list.
-+ */
-+class PArrayNode {
-+public:
-+  PArrayNode(parray::InnerPArray *parr, size_t prior = 0) :
-+      parray(parr), priority(prior), next(nullptr), prev(nullptr)
-+  {}
-+
-+  /// Pointer of a PArray instance
-+  parray::InnerPArray *parray;
-+  /// Priority of the node
-+  /// TODO(hc): This is not used, but keep it for the future
-+  size_t priority;
-+  /// Pointers to the next and the previous PArrayNodes
-+  PArrayNode *next;
-+  PArrayNode *prev;
-+};
-+
-+/**
-+ * @brief Double-linked list of candidate PArrays for eviction.
-+ * @details PArray eviction manager selects and evicts PArray instances
-+ * in this list depending on an eviction policy.
-+ * Note that an eviction manager manages this list for each device.
-+ */
-+class DoubleLinkedList {
-+public:
-+
-+  /**
-+   * @brief Print the current list.
-+   */
-+  void print() {
-+    PArrayNode *node = this->head_;
-+    std::cout << "\n";
-+    while (node != nullptr) {
-+      std::cout << node->parray->id << " -> \n"; 
-+      node = node->next;
-+    }
-+    std::cout << "\n";
-+  }
-+
-+  /**
-+   * @brief Append a PArray node to the list.
-+   * @detail The first PArray of the list is set to both head and tail, and
-+   * the last added PArray is set to tail.
-+   *
-+   * @param node PArray node to be appended
-+   */
-+  void append(PArrayNode *node) {
-+    this->mtx_.lock();
-+    if (this->list_size_ == 0) {
-+      this->head_ = node;
-+      this->tail_ = node;
-+    } else {
-+      this->tail_->next = node;
-+      node->prev = this->tail_;
-+      this->tail_ = node;
-+    }
-+    this->list_size_ += 1;
-+    this->mtx_.unlock();
-+  }
-+
-+  /**
-+   * @brief Insert a PArray node between `node` and `node->next`.
-+   *
-+   * @param node existing PArray node where `new_node` is being linked
-+   * @param new_node PArray node to be appended after `node`
-+   */
-+  void insert_after(PArrayNode *node, PArrayNode *new_node) {
-+    this->mtx_.lock();
-+    if (node->next != nullptr) {
-+      node->next->prev = new_node;
-+      new_node->next = node->next;
-+    } else {
-+      this->tail_ = new_node;
-+    }
-+    node->next = new_node;
-+    new_node->prev = node;
-+    this->mtx_.unlock();
-+  }
-+
-+  /**
-+   * @brief Insert a PArray node between `node` and `node->prev`.
-+   *
-+   * @param node existing PArray node where `new_node` is being linked
-+   * @param new_node PArray node to be appended before `node`
-+   */
-+  void insert_before(PArrayNode *node, PArrayNode *new_node) {
-+    this->mtx_.lock();
-+    if (node->prev != nullptr) {
-+      node->prev->next = new_node;
-+      new_node->prev = node->prev;
-+    } else {
-+      this->head_ = new_node;
-+    }
-+    node->prev = new_node;
-+    new_node->next = node;
-+    this->mtx_.unlock();
-+  }
-+
-+  /**
-+   * @brief Remove and return the current head PArray node from a list.
-+   */
-+  PArrayNode *remove_head() {
-+    this->mtx_.lock();
-+    PArrayNode *old_head = this->head_;
-+    if (old_head != nullptr) {
-+      this->remove_unsafe(old_head); 
-+    }
-+    this->mtx_.unlock();
-+    return old_head;
-+  }
-+
-+  /**
-+   * @brief Remove a node and return true if it is removed false otherwise.
-+   *
-+   * @param node PArray node to be removed from a list
-+   */
-+  bool remove(PArrayNode *node) {
-+    this->mtx_.lock();
-+    bool rv = this->remove_unsafe(node);
-+    this->mtx_.unlock();
-+    return rv;
-+  }
-+
-+  /**
-+   * @brief Remove a node and return true if it is removed false otherwise.
-+   * This function is not thread safe.
-+   *
-+   * @param node PArray node to be removed from a list
-+   */
-+  bool remove_unsafe(PArrayNode *node) {
-+    if (node->prev == nullptr && node->next == nullptr &&
-+        node != this->head_ && node != this->tail_) {
-+      // If a node is not in a list, do nothing and return false.
-+      return false;
-+    }
-+
-+    if (this->list_size_ == 1) {
-+      // A node is a single node in a list.
-+      this->head_ = this->tail_ = nullptr;
-+    } else {
-+      if (this->head_ == node) {
-+        // A node is a head, and so break link of node->next->prev.
-+        this->head_ = node->next;
-+        node->next->prev = nullptr;
-+      } else if (this->tail_ == node) {
-+        // A node is a tail, and so break link of node->prev->next.
-+        this->tail_ = node->prev;
-+        node->prev->next = nullptr;
-+      } else {
-+        // A node is in the middle of a list, and so break two links.
-+        node->prev->next = node->next;
-+        node->next->prev = node->prev;
-+      }
-+    }
-+    node->prev = node->next = nullptr;
-+    this->list_size_ -= 1;
-+    return true;
-+  }
-+
-+  /**
-+   * @brief Return a size of a list.
-+   */
-+  size_t size() {
-+    this->mtx_.lock();
-+    size_t list_size = this->list_size_;
-+    this->mtx_.unlock();
-+    return list_size;
-+  }
-+
-+  /**
-+   * @brief Return the current head.
-+   * This function is not thread safe.
-+   */
-+  PArrayNode *get_head() {
-+    return this->head_;
-+  }
-+
-+  /**
-+   * @brief Return the current tail.
-+   * This function is not thread safe.
-+   */
-+  PArrayNode *get_tail() {
-+    return this->tail_;
-+  }
-+
-+private:
-+  PArrayNode *head_{nullptr};
-+  PArrayNode *tail_{nullptr};
-+  std::mutex mtx_;
-+  size_t list_size_{0};
-+};
-+
-+
-+/**
-+ * @brief Least-recently-used policy based eviction manager for a device.
-+ * @details It holds PArrays which are not referenced by tasks which are
-+ * between task mapping and termination phases.
-+ */
-+class LRUDeviceEvictionManager {
-+public:
-+  struct PArrayMetaInfo {
-+    // Points to a PArray node if it exists
-+    PArrayNode *parray_node_ptr;
-+    // The number of references to a PArray
-+    size_t ref_count;
-+  };
-+
-+  LRUDeviceEvictionManager(DevID_t dev_id) : dev_id_(dev_id) {}
-+
-+  /**
-+   * @brief A task refers `parray` in the device.
-+   * @detail This function is called when a task being mapped
-+   * refers `parray`. This increases a reference count of the PArray
-+   * and removes it from a zero-referenced list if it exists.
-+   *
-+   * @param parray pointer to a parray to be referred by a task
-+   */
-+  void grab_parray_reference(parray::InnerPArray *parray) {
-+    this->mtx_.lock();
-+    uint64_t parray_id = parray->id;
-+    auto found = this->parray_reference_counts_.find(parray_id);
-+    if (found == this->parray_reference_counts_.end()) {
-+      // Add `parray` to a zr list if it does not exist.
-+      PArrayNode *parray_node = new PArrayNode(parray);
-+      this->parray_reference_counts_[parray_id] =
-+          PArrayMetaInfo{parray_node, 1};
-+    } else {
-+      // If `parray` is already in a zr list, removes it
-+      // from the list and increases its reference count.
-+      found->second.ref_count++; 
-+      this->zr_parray_list_.remove(found->second.parray_node_ptr);
-+    }
-+    this->mtx_.unlock();
-+  }
-+
-+  /**
-+   * @brief A task is finished and releases `parray` in the device.
-+   * @detail This function is called by a worker thread when a task
-+   * assigned to that thread is completed. The thread releases the
-+   * `parray` instance, and decreases its reference count in the device.
-+   * If the reference count becomes 0, the `parray` is added to
-+   * the zero-referenced list.
-+   *
-+   * @param parray pointer to a parray to be released by a task
-+   */
-+  void release_parray_reference(parray::InnerPArray *parray) {
-+    this->mtx_.lock();
-+    uint64_t parray_id = parray->id;
-+    auto found = this->parray_reference_counts_.find(parray_id);
-+    if (found != this->parray_reference_counts_.end()) {
-+      found->second.ref_count--; 
-+      if (found->second.ref_count == 0) {
-+        // If none of task referes to `parray`, add it to
-+        // a zr list.
-+        this->zr_parray_list_.append(found->second.parray_node_ptr);
-+      }
-+    }
-+    this->mtx_.unlock();
-+  }
-+
-+
-+  /**
-+   * @brief Return a size of a list.
-+   */
-+  size_t size() {
-+    size_t zr_parray_list_size{0};
-+    this->mtx_.lock();
-+    zr_parray_list_size = zr_parray_list_.size();
-+    this->mtx_.unlock();
-+    return zr_parray_list_size;
-+  }
-+
-+  /**
-+   * @brief Remove and return a head of the zero-referenced list.
-+   * @detail This function is not thread safe since it assumes that only
-+   * the scheduler thread calls into this function during eviction. 
-+   */
-+  PArrayNode *remove_and_return_head_from_zrlist() {
-+    PArrayNode* old_head{nullptr};
-+    this->mtx_.lock();
-+    old_head = this->zr_parray_list_.remove_head();
-+    this->mtx_.unlock();
-+    return old_head;
-+  }
-+
-+  /**
-+   * @brief This function clears all existing PArrays in the
-+   * zero-referenced list.
-+   * @detail This function has two purposes.
-+   * First, it is used to fix unlinked Python and C++ PArray
-+   * instances. It is possible that Python PArrays are destroyed
-+   * due to, for example, out-of-scope. Then, C++ PArrays
-+   * start to hold invalid Python PArray pointers.
-+   * When a scheduler starts PArray eviction, it is possible that
-+   * the C++ PArrays holding invalid Python PArrays are chosen 
-+   * as evictable PArrays and causes segmentation fault.
-+   * This function removes those PArrays in advance to avoid
-+   * this issue (But users should be aware of and take care of this scenario).
-+   * The second purpose is to allow users to clear all memory
-+   * related states managed by the Parla runtime.
-+   */
-+  // TODO(hc): This bulk flushing is not ideal IMO. The Parla runtime
-+  //           should provide a function that flushes only a single PArray.
-+  //           I am postponing this work since we need to take care of
-+  //           the zero-referenced list, but I have higher priorities.
-+  void clear_all_instances() {
-+    this->mtx_.lock();
-+    PArrayNode* head{nullptr};
-+    do {
-+      head = this->zr_parray_list_.remove_head(); 
-+    } while (head != nullptr);
-+    this->mtx_.unlock();
-+  }
-+
-+private:
-+  /// This eviction manager manages PArray instances in this device
-+  DevID_t dev_id_;
-+  std::mutex mtx_;
-+  /// Key: PArray ID, Value: Meta information including reference
-+  /// count of a PArray
-+  std::unordered_map<uint64_t, PArrayMetaInfo> parray_reference_counts_;
-+  /// A list of zero-referenced PArrays.
-+  DoubleLinkedList zr_parray_list_;
-+};
-+
-+
-+/**
-+ * @brief Least-recently-used policy based global eviction manager.
-+ * @details External components access and manipulate PArray instances in any
-+ * device through this manager.
-+ */
-+class LRUGlobalEvictionManager {
-+public:
-+  LRUGlobalEvictionManager(DeviceManager *device_manager) :
-+    device_manager_(device_manager) {
-+    this->device_mm_.resize(
-+        device_manager->template get_num_devices<DeviceType::All>());
-+    for (size_t i = 0; i < this->device_mm_.size(); ++i) {
-+      this->device_mm_[i] = new LRUDeviceEvictionManager(i);
-+    }
-+  }
-+
-+  /**
-+   * @brief A task refers `parray` in `dev_id` device.
-+   * @detail This function is called when a task being mapped
-+   * refers `parray`. This increases a reference count of the PArray
-+   * and removes it from a zero-referenced list if it exists.
-+   *
-+   * @param parray pointer to a parray to be referred by a task
-+   * @param dev_id device id of a device to access its information
-+   */
-+  void grab_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) {
-+    this->device_mm_[dev_id]->grab_parray_reference(parray);
-+  }
-+
-+  /**
-+   * @brief A task is finished and releases `parray` in `dev_id` device.
-+   * @detail This function is called by a worker thread when a task
-+   * assigned to that completes. So, the thread also releases a
-+   * `parray`, and decreases a reference count of that in the device.
-+   * If the reference count becomes 0, the `parray` is added to
-+   * its zero-referenced list.
-+   *
-+   * @param parray pointer to a parray to be released by a task
-+   * @param dev_id device id of a device to access its information
-+   */
-+  void release_parray_reference(parray::InnerPArray *parray, DevID_t dev_id) {
-+    this->device_mm_[dev_id]->release_parray_reference(parray);
-+  }
-+
-+  /**
-+   * @brief Return a size of a list.
-+   *
-+   * @param dev_id device id of a device to access its information
-+   */
-+  size_t size(DevID_t dev_id) {
-+    return this->device_mm_[dev_id]->size();
-+  }
-+
-+  /**
-+   * @brief Remove and return a head of the zero-referenced list.
-+   * @detail This function is not thread safe since it assumes that only
-+   * the scheduler thread calls into this function during eviction. 
-+   *
-+   * @param dev_id device id of a device to access its information
-+   */
-+  void *remove_and_return_head_from_zrlist(DevID_t dev_id) {
-+    PArrayNode *old_head =
-+        this->device_mm_[dev_id]->remove_and_return_head_from_zrlist();
-+    void *py_parray{nullptr};
-+    if (old_head != nullptr) {
-+      parray::InnerPArray *c_parray = old_head->parray;
-+      py_parray = c_parray->get_py_parray();
-+    }
-+    return py_parray;
-+  }
-+
-+  void clear_all_instances() {
-+    for (size_t i = 0; i < device_mm_.size(); ++i) {
-+      device_mm_[i]->clear_all_instances();
-+    }
-+  }
-+
-+private:
-+  /// Device manager managing system environment
-+  DeviceManager *device_manager_;
-+  /// A list of LRU-based eviction manager for each device
-+  std::vector<LRUDeviceEvictionManager *> device_mm_;
-+};
-+
-+#endif
-diff --git a/src/c/backend/include/parray.hpp b/src/c/backend/include/parray.hpp
-index 68d3de6..aa6fa56 100644
---- a/src/c/backend/include/parray.hpp
-+++ b/src/c/backend/include/parray.hpp
-@@ -30,8 +30,8 @@ enum AccessMode {
-   INOUT = 2,
-   /// Output of a task.
-   OUT = 3,
--  /// Removed PArray (false everywhere).
--  REMOVED = 4,
-+  /// Freed PArray (false everywhere).
-+  FREED = 4,
-   /// Deleted PArray (removed from table).
-   DELETED = 5,
- };
-@@ -59,7 +59,7 @@ public:
-   /// compared to restructuring overheads), but this counter is decreased.
-   /// This is used to provide more accurate PArray placement information
-   /// to the task mapping step.
--  std::vector<CopyableAtomic<size_t>> num_active_tasks;
-+  std::vector<CopyableAtomic<size_t>> num_referring_tasks;
- 
-   InnerPArray() = delete;
-   InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *,
-@@ -92,14 +92,14 @@ public:
-   /// Add a pointer of the task that will use this PArray to the task list
-   void add_task(InnerTask *task);
- 
--  /// Increase the counter for the active tasks that use this PArray.
--  void incr_num_active_tasks(DevID_t global_dev_id);
-+  /// Increase the number of the tasks referring this PArray.
-+  void incr_num_referring_tasks(DevID_t global_dev_id);
- 
--  /// Decrease the counter for the active tasks that use this PArray.
--  void decr_num_active_tasks(DevID_t global_dev_id);
-+  /// Decrease the number of the tasks referring this PArray.
-+  void decr_num_referring_tasks(DevID_t global_dev_id);
- 
--  /// Get the number of counter for the active tasks that use this PArray.
--  size_t get_num_active_tasks(DevID_t global_dev_id);
-+  /// Get the number of the tasks referring this PArray.
-+  size_t get_num_referring_tasks(DevID_t global_dev_id);
- 
-   // TODO(hc): I will replace this list with a concurrent map.
-   /// Get a reference to a list of tasks who are using this PArray
-@@ -136,8 +136,8 @@ private:
-   //           I will use this map: https://github.com/greg7mdp/parallel-hashmap
-   //           I have used this for a while and it is good.
-   TaskList _task_lists;
--
-+  /// Python PArray address
-   void *_py_parray;
- };
- 
--} // namespace parray
-\ No newline at end of file
-+} // namespace parray
-diff --git a/src/c/backend/include/parray_tracker.hpp b/src/c/backend/include/parray_tracker.hpp
-index b49c62f..c46141b 100644
---- a/src/c/backend/include/parray_tracker.hpp
-+++ b/src/c/backend/include/parray_tracker.hpp
-@@ -13,6 +13,10 @@
- 
- using namespace parray;
- 
-+/**
-+ * @brief PArray tracker that tracks PArray mapping state.
-+ * Note that this does not track slice PArrays, but a complete PArray.
-+ */
- class PArrayTracker {
- public:
-   PArrayTracker(size_t num_devices) : num_devices_(num_devices) {
-@@ -204,7 +208,7 @@ protected:
- 
-   /// Any worker thread can update states of PArrays.
-   /// Guard operations by this lock.
--  /// TODO(hc): This will be replaced with parallel hash map.
-+  /// TODO(hc): This will be replaced with parallel hash map (phmap).
-   std::mutex mtx;
- };
- 
-diff --git a/src/c/backend/include/runtime.hpp b/src/c/backend/include/runtime.hpp
-index f38cfac..17f72b3 100644
---- a/src/c/backend/include/runtime.hpp
-+++ b/src/c/backend/include/runtime.hpp
-@@ -44,6 +44,7 @@
- #include "parray_tracker.hpp"
- #include "profiling.hpp"
- #include "resource_requirements.hpp"
-+#include "memory_manager.hpp"
- 
- using namespace std::chrono_literals;
- using namespace parray;
-@@ -985,9 +986,26 @@ public:
-   /* Should Run, Stop Condition */
-   std::atomic<bool> should_run = true;
- 
-+  /* Clear all Python PArray references if users request */ 
-+  std::atomic<bool> clear_all_pyparrays = false;
-+  /* Clear all C PArray objects if users request in eviction manager */ 
-+  std::atomic<bool> clear_all_cparrays = false;
-+
-   /* Phase: maps tasks to devices */
-   Mapper *mapper;
- 
-+  /* If it is set, break an infinite loop in InnerScheduler::run()
-+     and invoke PArray eviction from PythonScheduler */
-+  bool break_for_eviction = false;
-+  /* Memory size to evict for each device */
-+  std::vector<size_t> memory_size_to_evict{0};
-+
-+  /* Set necessary memory size getting from eviction manager
-+     on each device */
-+  void set_memory_size_to_evict(size_t, DevID_t);
-+  /* Get memory size to evict for each device */
-+  size_t get_memory_size_to_evict(DevID_t);
-+
-   /* Phase reserves resources to limit/plan task execution*/
-   MemoryReserver *memory_reserver;
-   RuntimeReserver *runtime_reserver;
-@@ -995,7 +1013,7 @@ public:
-   /*Responsible for launching a task. Signals worker thread*/
-   Launcher *launcher;
- 
--  InnerScheduler(DeviceManager *device_manager);
-+  InnerScheduler(LRUGlobalEvictionManager *memory_manager, DeviceManager *device_manager);
-   ~InnerScheduler();
-   // InnerScheduler(int nworkers);
- 
-@@ -1017,6 +1035,9 @@ public:
-   /* Set Python "stop" callback */
-   void set_stop_callback(stopfunc_t stop_callback);
- 
-+  /* Get a flag that represents if there is still a task to be executed */
-+  bool get_should_run();
-+
-   /* Run the scheduler thread. Active for the lifetime of the Parla program */
-   void run();
- 
-@@ -1087,6 +1108,20 @@ public:
-   /* Release a PArray in a device */
-   void remove_parray(InnerPArray *parray, DevID_t global_dev_id);
- 
-+  /* Reflect a PArray removal to PArray trackers */
-+  void remove_parray_from_tracker(
-+      parray::InnerPArray *parray, DevID_t global_dev_id);
-+
-+  void grab_parray_reference(
-+      parray::InnerPArray *parray, DevID_t global_dev_id) {
-+    this->mm_->grab_parray_reference(parray, global_dev_id);
-+  }
-+
-+  void release_parray_reference(
-+      parray::InnerPArray *parray, DevID_t global_dev_id) {
-+    this->mm_->release_parray_reference(parray, global_dev_id);
-+  }
-+
-   /* Get mapped memory on device */
-   size_t get_mapped_memory(DevID_t global_dev_id);
- 
-@@ -1108,10 +1143,17 @@ public:
- 
-   DeviceManager *get_device_manager() { return this->device_manager_; }
- 
-+  /* Invoke clearing all C PArray instances from a PArray eviction manager. */
-+  void invoke_all_cparrays_clear();
-+  
-+  /* Get a flag that represents whether Python PArrays should be cleared */
-+  bool get_all_pyparrays_clear_flag();
- protected:
-   /// It manages all device instances in C++.
-   /// This is destructed by the Cython scheduler.
-   DeviceManager *device_manager_;
-+
-+  LRUGlobalEvictionManager *mm_;
- };
- 
- #endif // PARLA_BACKEND_HPP
-diff --git a/src/c/backend/memory_manager.cpp b/src/c/backend/memory_manager.cpp
-new file mode 100644
-index 0000000..f829294
---- /dev/null
-+++ b/src/c/backend/memory_manager.cpp
-@@ -0,0 +1 @@
-+#include "include/memory_manager.hpp"
-diff --git a/src/c/backend/parray.cpp b/src/c/backend/parray.cpp
-index 5e555d4..20cb46a 100644
---- a/src/c/backend/parray.cpp
-+++ b/src/c/backend/parray.cpp
-@@ -9,7 +9,7 @@ InnerPArray::InnerPArray(void *py_parray, uint64_t id, uint64_t parent_id,
-                          DevID_t num_devices)
-     : _py_parray(py_parray), id(id), parent_id(parent_id),
-       _parent_parray(parent_parray), _state(state), _num_devices(num_devices) {
--  num_active_tasks.resize(num_devices);
-+  num_referring_tasks.resize(num_devices);
- }
- 
- const uint64_t InnerPArray::get_size() const { return this->_size; }
-@@ -29,32 +29,32 @@ void InnerPArray::add_task(InnerTask *task) {
-   this->_task_lists.push_back(task);
- }
- 
--void InnerPArray::incr_num_active_tasks(DevID_t global_dev_id) {
-+void InnerPArray::incr_num_referring_tasks(DevID_t global_dev_id) {
-   if (this->_parent_parray != nullptr) {
--    this->_parent_parray->num_active_tasks[global_dev_id].fetch_add(
-+    this->_parent_parray->num_referring_tasks[global_dev_id].fetch_add(
-         1, std::memory_order_relaxed);
-   } else {
--    this->num_active_tasks[global_dev_id].fetch_add(1,
-+    this->num_referring_tasks[global_dev_id].fetch_add(1,
-                                                     std::memory_order_relaxed);
-   }
- }
- 
--void InnerPArray::decr_num_active_tasks(DevID_t global_dev_id) {
-+void InnerPArray::decr_num_referring_tasks(DevID_t global_dev_id) {
-   if (this->_parent_parray != nullptr) {
--    this->_parent_parray->num_active_tasks[global_dev_id].fetch_sub(
-+    this->_parent_parray->num_referring_tasks[global_dev_id].fetch_sub(
-         1, std::memory_order_relaxed);
-   } else {
--    this->num_active_tasks[global_dev_id].fetch_sub(1,
-+    this->num_referring_tasks[global_dev_id].fetch_sub(1,
-                                                     std::memory_order_relaxed);
-   }
- }
- 
--size_t InnerPArray::get_num_active_tasks(DevID_t global_dev_id) {
-+size_t InnerPArray::get_num_referring_tasks(DevID_t global_dev_id) {
-   if (this->_parent_parray != nullptr) {
--    return this->_parent_parray->num_active_tasks[global_dev_id].load(
-+    return this->_parent_parray->num_referring_tasks[global_dev_id].load(
-         std::memory_order_relaxed);
-   } else {
--    return this->num_active_tasks[global_dev_id].load(
-+    return this->num_referring_tasks[global_dev_id].load(
-         std::memory_order_relaxed);
-   }
- }
-diff --git a/src/c/backend/parray_tracker.cpp b/src/c/backend/parray_tracker.cpp
-index ac5e88d..e5226d7 100644
---- a/src/c/backend/parray_tracker.cpp
-+++ b/src/c/backend/parray_tracker.cpp
-@@ -12,7 +12,6 @@ size_t PArrayTracker::do_parray_creation_(AccessMode access_mode,
-   if (access_mode >= AccessMode::OUT || is_tracked) {
-     return 0;
-   } else {
--
-     size_t to_move = parray->get_size();
- 
-     // std::cout << "PArrayTracker::do_parray_creation " << std::endl;
-@@ -65,8 +64,8 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id,
-       this->set_parray_unsafe(i, parray_id, false);
-     }
- 
--    if (access_mode != AccessMode::REMOVED) {
--      this->set_parray_unsafe(dev_id, parray_id, false);
-+    if (access_mode != AccessMode::FREED) {
-+      this->set_parray_unsafe(dev_id, parray_id, true);
-     }
- 
-     if (is_slice) {
-@@ -92,7 +91,6 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id,
-         }
-       }
-       */
--
-     } else {
-       // If the PArray is not a slice, it is a parent
-       // invalidate all its children on all devices except the target
-@@ -104,7 +102,7 @@ size_t PArrayTracker::do_parray_removal_(AccessMode access_mode, DevID_t dev_id,
-           this->set_parray_unsafe(i, child_id, false);
-         }
- 
--        if (access_mode != AccessMode::REMOVED) {
-+        if (access_mode != AccessMode::FREED) {
-           this->set_parray_unsafe(dev_id, child_id, false);
-         }
-       }
-@@ -217,4 +215,4 @@ size_t PArrayTracker::check_log(const DevID_t dev_id,
-   // std::cout << "Checked parray removal: " << to_move << std::endl;
- 
-   return to_move;
--}
-\ No newline at end of file
-+}
-diff --git a/src/c/backend/phases.cpp b/src/c/backend/phases.cpp
-index 79eef62..6975c82 100644
---- a/src/c/backend/phases.cpp
-+++ b/src/c/backend/phases.cpp
-@@ -93,7 +93,7 @@ void Mapper::map_task(InnerTask *task, DeviceRequirementList &chosen_devices) {
-       mapped_pool.increase<Resource::Memory>(mapped_size);
- 
-       InnerPArray *parray = parray_access.first;
--      parray->incr_num_active_tasks(global_dev_id);
-+      parray->incr_num_referring_tasks(global_dev_id);
-     }
-   }
- 
-@@ -132,6 +132,14 @@ void Mapper::run(SchedulerPhase *next_phase) {
-   this->drain_parray_buffer();
- 
-   // TODO Fix Issue #108
-+  /** XXX(hc): revisit this
-+  // Accumulate necessary memory size for PArrays being mapped.
-+  std::vector<size_t> accum_necessary_memory;
-+  // Eviction manager does not evict CPU instances; In the future,
-+  // this also might be evicted to other devices. But to simplify indexing,
-+  // allocate as the number of total devices.
-+  accum_necessary_memory.resize(this->device_manager->get_num_devices(DeviceType::All));
-+  ***/
-   while (has_task && num_task_mapping_attempt < 20) {
- 
-     this->drain_parray_buffer();
-@@ -277,6 +285,9 @@ bool MemoryReserver::check_data_resources(InnerTask *task) {
-       InnerPArray *parray = parray_access.first;
-       AccessMode access_mode = parray_access.second;
- 
-+      // Register this PArray to eviction manager's table
-+      this->scheduler->grab_parray_reference(parray, local_device_idx);
-+
-       // If the PArray is not an input, then we don't need to check size
-       // Note(@dialecticDolt):
-       // There is literally no such thing as an out type in our syntax why do we
-@@ -305,6 +316,10 @@ bool MemoryReserver::check_data_resources(InnerTask *task) {
- 
-     status = status && device_status;
-     if (!status) {
-+      // If a device has not enough memory, activate eviction manager
-+      this->scheduler->set_memory_size_to_evict(
-+          size_on_device * 2, local_device_idx);
-+      this->scheduler->break_for_eviction = true;
-       break;
-     }
-   }
-diff --git a/src/c/backend/policy.cpp b/src/c/backend/policy.cpp
-index 1cf1c76..89a85fb 100644
---- a/src/c/backend/policy.cpp
-+++ b/src/c/backend/policy.cpp
-@@ -12,6 +12,8 @@ bool LocalityLoadBalancingMappingPolicy::calc_score_devplacement(
-   // std::cout << "[Locality-aware- and Load-balancing mapping policy]\n";
- 
-   // Check device resource availability.
-+  // Note that this counter is tracking 'launched' or 'reserved' resource
-+  // status.
-   if (!device.check_resource_availability(dev_placement_req.get())) {
-     // std::cout << "Device resource failure!" << std::endl;
-     return false;
-diff --git a/src/c/backend/scheduler.cpp b/src/c/backend/scheduler.cpp
-index a6fadfc..95152aa 100644
---- a/src/c/backend/scheduler.cpp
-+++ b/src/c/backend/scheduler.cpp
-@@ -126,8 +126,18 @@ template class WorkerPool<WorkerQueue, WorkerQueue>;
- 
- // Scheduler Implementation
- 
--InnerScheduler::InnerScheduler(DeviceManager *device_manager)
--    : device_manager_(device_manager) {
-+InnerScheduler::InnerScheduler(LRUGlobalEvictionManager* memory_manager,
-+                               DeviceManager *device_manager)
-+    : device_manager_(device_manager), mm_(memory_manager) {
-+
-+  // For now, it does not evict PArrays on CPU memory.
-+  this->memory_size_to_evict.resize(
-+      device_manager->template get_num_devices<DeviceType::All>());
-+
-+  // A dummy task count is used to keep the scheduler alive.
-+  // NOTE: At least one task must be added to the scheduler by the main thread,
-+  // otherwise the runtime will finish immediately
-+  // this->increase_num_active_tasks();
- 
-   this->workers.set_num_workers(1);
- 
-@@ -157,31 +167,70 @@ void InnerScheduler::set_stop_callback(stopfunc_t stop_callback) {
-   this->stop_callback = stop_callback;
- }
- 
-+bool InnerScheduler::get_should_run() {
-+  return this->should_run.load();
-+}
-+
-+bool InnerScheduler::get_all_pyparrays_clear_flag() {
-+  return this->clear_all_pyparrays.load();
-+}
-+
-+void InnerScheduler::set_memory_size_to_evict(
-+    size_t size, DevID_t dev_id) {
-+  this->memory_size_to_evict[dev_id] = size;
-+}
-+
-+size_t InnerScheduler::get_memory_size_to_evict(DevID_t dev_id) {
-+  return this->memory_size_to_evict[dev_id];
-+}
-+
- void InnerScheduler::run() {
-   NVTX_RANGE("Scheduler::run", NVTX_COLOR_RED)
--  unsigned long long iteration_count = 0;
-+  this->clear_all_cparrays = false;
-+  this->clear_all_pyparrays = false;
-   while (this->should_run.load()) {
-+    this->break_for_eviction = false;
-     auto status = this->activate();
-     if (this->sleep_flag) {
-       std::this_thread::sleep_for(std::chrono::milliseconds(this->sleep_time));
-     }
-+    if (this->break_for_eviction) {
-+      // Yield a control to a Python scheduler to evict PArrays since
-+      // PArray coherency protocol is managed at there.
-+      break;
-+    }
-+    if (this->clear_all_cparrays.load()) {
-+      // TODO(hc): This should be more generalized and refined.
-+      // Temporarily use it as experimental puprose.
-+      std::cout << "Clear all C/Python parrays..\n";
-+      this->mm_->clear_all_instances();
-+      this->clear_all_pyparrays = true;
-+      break;
-+    }
-   }
- }
- 
- void InnerScheduler::stop() {
-   LOG_INFO(SCHEDULER, "Stopping scheduler");
-   this->should_run = false;
--  launch_stop_callback(this->stop_callback, this->py_scheduler);
-+  // XXX(hc): To process PArray eviction on Python,
-+  // Python scheduler now has an while loop that iterates until there is
-+  // no more task, and it wraps C scheduler's loop.
-+  // Therefore, there is no point for C++ scheduler to explicitly invoke
-+  // this callback at here. Python scheduler knows when it needs to stop.
-+  //launch_stop_callback(this->stop_callback, this->py_scheduler);
-   LOG_INFO(SCHEDULER, "Stopped scheduler");
- }
- 
-+void InnerScheduler::invoke_all_cparrays_clear() {
-+  this->clear_all_cparrays = true; 
-+}
-+
- Scheduler::Status InnerScheduler::activate() {
-   // std::cout<< "Scheduler Activated" << std::endl;
--
-   this->mapper->run(this->memory_reserver);
-   this->memory_reserver->run(this->runtime_reserver);
-   this->runtime_reserver->run(this->launcher);
--
-   // LOG_TRACE(SCHEDULER, "ReadyPhase Status: {}", this->runtime_reserver);
-   return this->status;
- }
-@@ -289,6 +338,15 @@ void InnerScheduler::remove_parray(InnerPArray *parray, DevID_t global_dev_id) {
-   // Could also be to call DELETED or REMOVED status on do_log
- }
- 
-+void InnerScheduler::remove_parray_from_tracker(
-+    parray::InnerPArray *parray, DevID_t global_dev_id) {
-+  AccessMode access_mode = AccessMode::FREED;
-+  this->mapper->get_parray_tracker()->do_log(global_dev_id,
-+      std::make_pair(parray, access_mode));
-+  this->memory_reserver->get_parray_tracker()->do_log(global_dev_id,
-+      std::make_pair(parray, access_mode));
-+}
-+
- size_t InnerScheduler::get_mapped_memory(DevID_t global_dev_idx) {
-   Device *device =
-       this->device_manager_->get_device_by_global_id(global_dev_idx);
-@@ -362,7 +420,12 @@ void InnerScheduler::task_cleanup_postsync(InnerWorker *worker, InnerTask *task,
-       for (size_t j = 0; j < parray_access_list.size(); ++j) {
-         auto &parray_access = parray_access_list[j];
-         InnerPArray *parray = parray_access.first;
--        parray->decr_num_active_tasks(dev_id);
-+        parray->decr_num_referring_tasks(dev_id);
-+        // Decrease this PArray's reference count.
-+        // If this becomes 0, this instance will be release
-+        // when the PArray coherency protocol updates it
-+        // to eviction state.
-+        this->release_parray_reference(parray, dev_id);
-       }
-     }
- 
-diff --git a/src/python/parla/__init__.py b/src/python/parla/__init__.py
-index ba53b56..49ca47a 100644
---- a/src/python/parla/__init__.py
-+++ b/src/python/parla/__init__.py
-@@ -7,6 +7,7 @@ from .cython import core
- from .cython import device_manager
- from .cython import device
- from .cython import variants
-+from .cython import mm
- from .common.spawn import spawn
- from .common import parray
- 
-@@ -22,6 +23,7 @@ TaskSpace = tasks.TaskSpace
- Tasks = tasks.TaskCollection
- 
- DeviceManager = device_manager.PyDeviceManager
-+PyMM = mm.PyMM
- 
- Stream = device.Stream
- create_env = tasks.create_env
-@@ -59,6 +61,7 @@ class Parla:
-         self.sig = sig_type
-         self.handle_interrupt = True
-         self._device_manager = DeviceManager(dev_config_file)
-+        self._memory_manager = PyMM(self._device_manager)
- 
-         if logfile is None:
-             logfile = os.environ.get("PARLA_LOGFILE", None)
-@@ -77,7 +80,9 @@ class Parla:
-         if hasattr(self, "_sched"):
-             raise ValueError(
-                 "Do not use the same Parla object more than once.")
--        self._sched = self.scheduler_class(self._device_manager, **self.kwds)
-+        self._sched = self.scheduler_class(self._memory_manager,
-+                                           self._device_manager,
-+                                           **self.kwds)
- 
-         self.interuppted = False
-         self.released = False
-diff --git a/src/python/parla/common/parray/@ b/src/python/parla/common/parray/@
-new file mode 100644
-index 0000000..af841d1
---- /dev/null
-+++ b/src/python/parla/common/parray/@
-@@ -0,0 +1,541 @@
-+from __future__ import annotations
-+from typing import TYPE_CHECKING, Union, List, Dict, Tuple, Any
-+
-+import numpy
-+import ctypes
-+
-+#TODO: Fix this to be more stable and less of a hack.
-+try:
-+    import cupy
-+except (ImportError, AttributeError):
-+    import numpy as cupy
-+
-+from .coherence import CPU_INDEX
-+
-+if TYPE_CHECKING:  # False at runtime
-+    import cupy
-+    ndarray = Union[numpy.ndarray, cupy.ndarray]
-+    SlicesType = Union[slice, int, tuple]
-+    IndicesMapType = List[Union[Dict[int, int], tuple]]
-+    from parla.cython.cyparray_state import CyPArrayState
-+
-+class MultiDeviceBuffer:
-+    """Underlying Buffer of PArray.
-+
-+    It holds per device array copy and also index mapping.
-+    """
-+
-+    _buffer: Dict[int, ndarray | List[ndarray] | None]
-+    shape: tuple
-+    _indices_map: Dict[int, List[IndicesMapType] | None]
-+    _cyparray_state: CyPArrayState
-+
-+    def __init__(self, num_gpu: int, cyparray_state: CyPArrayState):
-+        # per device buffer
-+        # key: device_id
-+        # val: single (complete) ndarray or list of (sub) ndarray
-+        self._buffer = {n: None for n in range(num_gpu)}  # add gpu id
-+        self._buffer[CPU_INDEX] = None  # add cpu id
-+
-+        # per device indices mapping
-+        # key: device_id
-+        # val: list of {global_index: local_index} and tuple(begin, end, stop), and the tuple is a represent of slice(begin, end, stop)
-+        self._indices_map = {n: None for n in range(num_gpu)}
-+        self._indices_map[CPU_INDEX] = None
-+
-+        # the shape of the complete array
-+        self.shape = ()
-+
-+        self._cyparray_state = cyparray_state
-+
-+    def nbytes_at(self, device_id:int) -> int:
-+        """
-+        Return the buffer size at `device_id`
-+        """
-+        buffer = self._buffer[device_id]
-+        if buffer is None:
-+            return 0
-+        elif isinstance(buffer, list): # subarray at this device buffer
-+            # size is the sum
-+            nbytes = 0
-+            for subarray in buffer:
-+                nbytes += subarray.nbytes
-+            return nbytes
-+        else:  # complete array
-+            return buffer.nbytes
-+
-+    def set_complete_array(self, array: ndarray) -> int:
-+        """
-+        Add array into the buffer (based on array's device).
-+
-+        Args:
-+            array: :class:`cupy.ndarray` or :class:`numpy.array` object
-+
-+        Return:
-+            a location (device_id) of the array
-+        """
-+        # get the array's location
-+        if isinstance(array, numpy.ndarray):
-+            location = CPU_INDEX
-+        else:
-+            location = int(array.device)
-+
-+        self._buffer[location] = array
-+        self.shape = array.shape
-+        self._cyparray_state.set_exist_on_device(location, True)
-+        return location
-+
-+    def set(self, device_id: int, array: ndarray, is_complete: bool = True, overwrite: bool = False) -> None:
-+        """
-+        Set copy at a device, also clean up existing `indices_map` if necessary
-+
-+        Args:
-+            device_id: gpu device_id or CPU_INDEX
-+            array: :class:`cupy.ndarray` or :class:`numpy.array` object
-+            is_complete: True if `array` is a complete copy, otherwise `array` is a subarray
-+            overwrite: True if need to clean other subarray copy inside the device before assign the new array
-+        """
-+        if is_complete:
-+            self._indices_map[device_id] = None
-+            self._buffer[device_id] = array
-+        else:
-+            if not isinstance(self._buffer[device_id], List) or overwrite:
-+                self._indices_map[device_id] = None
-+                self._buffer[device_id] = [array]
-+            else:
-+                self._buffer[device_id].append(array)
-+        self._cyparray_state.set_exist_on_device(device_id, True)
-+
-+    def get(self, device_id: int) -> ndarray | List[ndarray] | None:
-+        """
-+        Return the copy at a device
-+
-+        Args:
-+            device_id: gpu device_id or CPU_INDEX
-+
-+        Return
-+            :class:`cupy.ndarray` or :class:`numpy.array` object
-+        """
-+        return self._buffer[device_id]
-+
-+    def get_global_slices(self, device_id:int, subarray_index:int) -> SlicesType | None:
-+        """
-+        Return global slices of one copy at the device.
-+
-+        If the copy is complete, return None
-+        """
-+        if self._indices_map[device_id] is None:
-+            return None
-+        else:
-+            slices = []
-+            for device_indices in self._indices_map[device_id][subarray_index]:
-+                if isinstance(device_indices, dict):
-+                    index = list(device_indices.keys())
-+                    if len(index) == 1:
-+                        slices.append(index[0])
-+                    else:
-+                        slices.append(index)
-+                else:
-+                    slices.append(slice(*device_indices))
-+
-+            return tuple(slices)
-+
-+    @staticmethod
-+    def _map_int_with_int_map(n: int, int_map: Dict[int, int]) -> int | None:
-+        """
-+        Find the mapping of `n` in `int_map`
-+
-+        if `n` not in `int_map`, return None
-+
-+        example:
-+            n: 2
-+            int_map: {1:0, 2:1}
-+            return: 1
-+        """
-+        return None if n not in int_map else int_map[n]
-+
-+    @staticmethod
-+    def _map_int_with_slice(n: int, target_slice: tuple) -> int | None:
-+        """
-+        Find the mapping of `n` in a `target_slice` (find index of `n` in `target_slice`)
-+        `target_slice` is a tuple(begin, end, step)
-+
-+        if `n` not in `target_slice`, return None
-+
-+        example:
-+            n: 2
-+            target_slice: (2, 4, 1)
-+            return: 0
-+        """
-+        # TODO: assume slice is simple (no neg value)
-+        begin, end, step = target_slice
-+        step = 1 if step is None else step
-+
-+        # bound checking
-+        if n < begin or n >= end:
-+            return None
-+        if (n - begin) % step != 0:
-+            return None
-+
-+        return (n - begin) // step
-+
-+    @staticmethod
-+    def _map_slice_with_slice(input_slice: tuple, target_slice: tuple) -> tuple | None:
-+        """
-+        Find the mapping of `input_slice` in a `target_slice`
-+        `input_slice` and `target_slice` is a tuple(begin, end, step)
-+
-+        if `input_slice` not a subset of `target_slice`, return None
-+
-+        example:
-+            input_slice: (2, 10, 4)
-+            target_slice: (0, 10, 2)
-+            return: (1, 5, 2)
-+        """
-+        # TODO: assume slice is simple (no neg value)
-+        target_begin, target_end, target_step = target_slice
-+        target_step = 1 if target_step is None else target_step
-+
-+        input_begin, input_end, input_step = input_slice
-+        input_step = 1 if input_step is None else input_step
-+
-+        mapped_begin = MultiDeviceBuffer._map_int_with_slice(
-+            input_begin, target_slice)
-+
-+        # get the last possible element in range of `input_slice`
-+        # TODO: what if last_element < input_begin ?
-+        last_element = input_end - input_step + (input_end - input_begin) % input_step
-+        mapped_end = MultiDeviceBuffer._map_int_with_slice(last_element, target_slice)
-+
-+        if mapped_begin is None or mapped_end is None:
-+            return None
-+
-+        # adjust step
-+        if input_step % target_step != 0:
-+            return None
-+        mapped_step = input_step // target_step
-+
-+        return mapped_begin, mapped_end + 1, mapped_step  # tuple
-+
-+    def map_local_slices(self, device_id: int, global_slices: SlicesType) -> (int, SlicesType):
-+        """
-+        Map a given global slices to local slices wrt buffer at the device.
-+
-+        Raise error if `global_slices` out of range
-+
-+        Return subarray_index: the index of subarray in the list of `_buffer[device_id]`
-+               local_slices: the local slices which maps to the `global_slices`
-+        Note: this method assume a indices mapping exists for this device
-+        """
-+        # indexing into the whole array, index of out bound
-+        not_tuple = False
-+        if not isinstance(global_slices, tuple):  # if not a tuple, make it a tuple
-+            global_slices = tuple([global_slices])
-+            not_tuple = True
-+
-+        local_slices = []
-+
-+        if len(self.shape) < len(global_slices):
-+            raise IndexError(f"index out of range, index:{global_slices}")
-+
-+        final_subarray_index = 0
-+
-+        for subarray_index in range(len(self._indices_map[device_id])):  # for each subarray at this device
-+            indices_map = self._indices_map[device_id][subarray_index]
-+
-+            for d in range(len(global_slices)):
-+                size = self.shape[d]  # number of entries at this axis
-+                global_index = global_slices[d]
-+                index_map = None if d >= len(indices_map) else indices_map[d]
-+
-+                if index_map is None:  # None means 1:1 map to all elements at this axis
-+                    local_index = global_index
-+                elif isinstance(index_map, dict) and len(index_map) == 1:
-+                    # special case, this axis was indexed by a int, so
-+                    # dimension was reduced by 1,
-+                    # need to ignore this axis, just check index match or not
-+                    if list(index_map.keys())[0] == global_index:  # false if type or value doesn't match
-+                        continue
-+                    else:
-+                        local_index = None
-+                elif isinstance(index_map, tuple):
-+                    if isinstance(global_index, int):  # int vs slice
-+                        local_index = MultiDeviceBuffer._map_int_with_slice(global_index, index_map)
-+                    elif isinstance(global_index, list):  # List[int] vs slice
-+                        local_index = [MultiDeviceBuffer._map_int_with_slice(i, index_map) for i in global_index]
-+
-+                        # any index out of bound?
-+                        if None in local_index:
-+                            local_index = None
-+                    elif isinstance(global_index, slice):  # slice vs slice
-+                        # slice to tuple
-+                        slice_tuple = global_index.indices(size)
-+                        local_tuple = MultiDeviceBuffer._map_slice_with_slice(slice_tuple, index_map)
-+                        if local_tuple is None:
-+                            local_index = None
-+                        else:
-+                            local_index = slice(*local_tuple)
-+                    else:
-+                        raise IndexError(f"Unsupported slices type: {type(global_index)}")
-+                else:  # Map is int or list<int>
-+                    if isinstance(global_index, int):  # int vs int/list
-+                        local_index = self._map_int_with_int_map(global_index, index_map)
-+                    elif isinstance(global_index, list):  # list vs int/list
-+                        local_index = [self._map_int_with_int_map(i, index_map) for i in global_index]
-+
-+                        if None in local_index:
-+                            local_index = None
-+                    elif isinstance(global_index, slice):  # slice vs int/list
-+                        # slice to tuple
-+                        slice_tuple = global_index.indices(size)
-+                        local_index = [self._map_int_with_int_map(i, index_map) for i in range(*slice_tuple)]
-+
-+                        if None in local_index:
-+                            local_index = None
-+                    else:
-+                        raise IndexError(f"Unsupported slices type {type(global_index)}")
-+
-+                # if None, it means index out of range at this axis
-+                if local_index is None:
-+                    # check next copy
-+                    local_slices = None
-+                    break
-+
-+                local_slices.append(local_index)
-+
-+            if local_slices is None:  # result is not found for this subarray
-+                if subarray_index == len(self._indices_map[device_id]) - 1:  # this is the last subarray
-+                    local_slices = None  # non slices is found
-+                else: # check next subarray
-+                    local_slices = []  # clear intermidate result
-+            else:
-+                final_subarray_index = subarray_index
-+                break
-+
-+        if local_slices is None:
-+            raise IndexError(f"index out of range, index:{global_slices}")
-+        elif not_tuple:
-+            if len(local_slices) == 0:  # only be possible when special case int vs int exists and all axis are ignored
-+                return final_subarray_index, slice(None, None, None)
-+            else:
-+                return final_subarray_index, local_slices[0]
-+        else:
-+            return final_subarray_index, tuple(local_slices)
-+
-+    def set_slices_mapping(self, device_id: int, global_slices: SlicesType):
-+        """
-+        set a global slices to local slices mapping wrt buffer at the device.
-+
-+        Raise error if `global_slices` is higher dim than shape
-+        Note: this call doesn't check slice is within range, if it is not in range
-+              exception will be trigger later when trying to index into the copy
-+        """
-+        if not isinstance(global_slices, tuple):  # if not a tuple, make it a tuple
-+            global_slices = tuple([global_slices])
-+
-+        if len(self.shape) < len(global_slices):
-+            raise IndexError(f"index out of range, index:{global_slices}")
-+
-+        slices_map_list = []
-+        for d in range(len(global_slices)):
-+            size = self.shape[d]  # number of entries at this axis
-+            global_slice = global_slices[d]
-+
-+            if isinstance(global_slice, int):  # a single integer
-+                slice_map = {global_slice: 0}
-+            elif isinstance(global_slice, list):  # a list of integer
-+                slice_map = {global_slice[i]: i for i in range(len(global_slice))}
-+            elif isinstance(global_slice, slice):  # slice
-+                # save slice as a tuple
-+                # None in slice will be instantiated by concrete values
-+                slice_map = global_slice.indices(size)
-+            else:
-+                raise IndexError(f"Unsupported slices type {type(global_slice)}")
-+            slices_map_list.append(slice_map)
-+
-+        if self._indices_map[device_id] is None:
-+            self._indices_map[device_id] = [slices_map_list]
-+        else:
-+            self._indices_map[device_id].append(slices_map_list)
-+
-+    def get_by_global_slices(self, device_id: int, global_slices: SlicesType):
-+        """
-+        Indexing/Slicing the buffer by `global_slices`.
-+
-+        `global_slices` will be first converted into local slices
-+
-+        Args:
-+            device_id: gpu device_id or CPU_INDEX
-+            global_slices: slice/ints/tuple/list<int>, use the same format as advance indexing of numpy
-+
-+        Return
-+            :class:`cupy.ndarray` or :class:`numpy.array` object or `None` if there is no copy at that device
-+        """
-+        # check if a copy exists at this device
-+        #
-+        # This is needed for the usage of @spawn() annotation
-+        # sometimes device has no copy but still need a parray slices view object in advance
-+        # Example:
-+        # A = parray(numpy_array)
-+        # @spawn(inout=[A], placement=gpu(0))
-+        # def task1():
-+        #   ... a task move A to GPU. free CPU copy ...
-+        # @spawn(inout=[A[0]], placement=gpu(1))
-+        # def task2():
-+        #   error: cpu's copy is freed -> `inout=[A[0]]` trigger an exception,
-+        #   since it try a slice A at CPU (spawn() itself is happened at outer CPU task)
-+        if self._buffer[device_id] is None:
-+            return None
-+
-+        # check if there is a mapping
-+        if self._indices_map[device_id] is None:
-+            return self._buffer[device_id].__getitem__(global_slices)
-+        else:
-+            # map global slices to local slices
-+            subarray_index, local_slices = self.map_local_slices(device_id, global_slices)
-+            return self._buffer[device_id][subarray_index].__getitem__(local_slices)
-+
-+    def set_by_global_slices(self, device_id: int, global_slices: SlicesType, value: ndarray | Any):
-+        """
-+        Indexing/Slicing the buffer by `global_slices` and set value.
-+
-+        `global_slices` will be first converted into local slices
-+
-+        Args:
-+            device_id: gpu device_id or CPU_INDEX
-+            global_slices: slice/ints/tuple/list<int>, use the same format as advance indexing of numpy
-+            value: the data to set
-+
-+        Return
-+            :class:`cupy.ndarray` or :class:`numpy.array` object
-+        """
-+        # check if there is a mapping
-+        if self._indices_map[device_id] is None:
-+            self._buffer[device_id].__setitem__(global_slices, value)
-+        else:
-+            # map global slices to local slices
-+            subarray_index, local_slices = self.map_local_slices(device_id, global_slices)
-+            self._buffer[device_id][subarray_index].__setitem__(local_slices, value)
-+
-+
-+    def _move_data(self, copy_func, dst: int, src: int, subarray_index: int, dst_slices: SlicesType, src_slices: SlicesType, dst_is_current_device:bool = True):
-+        """
-+        Helper function for copy_data_between_device
-+        """
-+        if dst_is_current_device:
-+            if dst_slices is None and src_slices is None:  # Complete to Complete
-+                self._buffer[dst] = copy_func(self._buffer[src])
-+            elif dst_slices is None and src_slices is not None:  # Incomplete to Complete
-+                self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index])
-+            elif dst_slices is not None and src_slices is None:  # Complete to incomplete
-+                if self._buffer[dst] is None:
-+                    self._buffer[dst] = []
-+                self._buffer[dst].append(copy_func(self._buffer[src][dst_slices]))
-+            else:  # incomplete to incomplete
-+                raise ValueError("Copy from subarray to subarray is unsupported")
-+        else:
-+            with cupy.cuda.Device(dst):  # switch device
-+                if dst_slices is None and src_slices is None:  # Complete to Complete
-+                    self._buffer[dst] = copy_func(self._buffer[src])
-+                elif dst_slices is None and src_slices is not None:  # Incomplete to Complete
-+                    self._buffer[dst][src_slices] = copy_func(self._buffer[src][subarray_index])
-+                elif dst_slices is not None and src_slices is None:  # Complete to incomplete
-+                    if self._buffer[dst] is None:
-+                        self._buffer[dst] = []
-+                    self._buffer[dst].append(copy_func(self._buffer[src][dst_slices]))
-+                else:  # incomplete to incomplete
-+                    raise ValueError("Copy from subarray to subarray is unsupported")
-+
-+    def copy_data_between_device(self, dst: int, src: int, dst_is_current_device: bool = True) -> None:
-+        """
-+        Copy data from src to dst.
-+
-+        dst is current device if `dst_is_current_device` is True
-+        """
-+        # a function to copy data between GPU devices async
-+        def copy_from_device_async(src):
-+            dst_data = cupy.empty_like(src)
-+            dst_data.data.copy_from_device_async(src.data, src.nbytes)
-+            return dst_data
-+
-+        if self._indices_map[src] is None:
-+            src_slices_list = [None]
-+        else:
-+            src_slices_list = [self.get_global_slices(src, i) for i in range(len(self._indices_map[src]))]
-+
-+        # TRICK: if there are multiple subarray in this device, always pick the last one
-+        # this is because load of data always comes together with create indices mapping
-+        # so the indices mapping will put at the end of self._indices_map
-+        dst_slices = self.get_global_slices(dst, -1)
-+
-+        for subarray_index in range(len(src_slices_list)):
-+            src_slices = src_slices_list[subarray_index]
-+            if src == CPU_INDEX:  # copy from CPU to GPU
-+                self._move_data(cupy.asarray, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device)
-+            elif dst != CPU_INDEX:  # copy from GPU to GPU
-+                self._move_data(copy_from_device_async, dst, src, subarray_index, dst_slices, src_slices, dst_is_current_device)
-+            else:  # copy from GPU to CPU
-+                self._move_data(cupy.asnumpy, dst, src, subarray_index, dst_slices, src_slices)  # dst_is_current_device is no need if dst is CPU
-+        self._cyparray_state.set_exist_on_device(dst, True)
-+
-+    def get_slices_hash(self, global_slices: SlicesType) -> int:
-+        """
-+        Get hash value of a slices of complete array.
-+
-+        This could be done by replaing list and slice to tuple
-+        """
-+        # little chance to have collision, but what if it happened?
-+        hash_value = 17 # use a none zero hash value, so hash(0) != 0
-+        prime = 31
-+        if not isinstance(global_slices, tuple):
-+            if isinstance(global_slices, list):
-+                # Built-int hash() method might return negtive value.
-+                # c_size_t is to ensure it is not negative
-+                hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(global_slices))).value
-+            elif isinstance(global_slices, slice):
-+                hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices.indices(self.shape[0]))).value
-+            else:
-+                hash_value = hash_value * prime + ctypes.c_size_t(hash(global_slices)).value
-+        else:
-+            if len(self.shape) < len(global_slices):
-+                raise IndexError(f"index out of range, index:{global_slices}")
-+
-+            for d in range(len(global_slices)):
-+                index = global_slices[d]
-+                if isinstance(index, list):
-+                    hash_value = hash_value * prime + ctypes.c_size_t(hash(tuple(index))).value
-+                elif isinstance(index, slice):
-+                    hash_value = hash_value * prime + ctypes.c_size_t(hash(index.indices(self.shape[d]))).value
-+                else:
-+                    hash_value = hash_value * prime + ctypes.c_size_t(hash(index)).value
-+
-+        return hash_value
-+
-+    def __str__(self):
-+        return str(self._buffer)
-+
-+    def __contains__(self, device_id):
-+        """
-+        Return True if there is a copy in this device
-+        """
-+        return device_id in self._buffer and self._buffer[device_id] is not None
-+
-+    def clear(self, device_id) -> None:
-+        """
-+        Clear data in device_id
-+        """
-+        import gc
-+        mempool = cupy.get_default_memory_pool()
-+        pinned_mempool = cupy.get_default_pinned_memory_pool()
-+        print("inside evict():", mempool.used_bytes(), flush=True)
-+        pritn("total:", mempool.total_bytes())
-+        print("device id:", device_id, flush=True)
-+#print("inside pinned evict():", pinned_mempool.used_bytes(), flush=True)
-+        print("evicting array size:", self._buffer[device_id].nbytes)
-+        print("type:", type(self._buffer[device_id]))
-+        del self._buffer[device_id]
-+        print("After inside evict():", mempool.used_bytes(), flush=True)
-+#print("After inside pinned evict():", pinned_mempool.used_bytes(), flush=True)
-+        self._indices_map[device_id] = None
-+        self._buffer[device_id] = None
-+        self._cyparray_state.set_exist_on_device(device_id, False)
-diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py
-index e25396c..8d89075 100644
---- a/src/python/parla/common/parray/core.py
-+++ b/src/python/parla/common/parray/core.py
-@@ -120,6 +120,8 @@ class PArray:
-         # Note(@dialecticDolt):It should be valid to create PArrays outside of a scheduler context!!
-         # FIXME
- 
-+        # Register this PArray to tracker and make a link between
-+        # C PArray instance.
-         scheduler = get_scheduler()
-         if scheduler is None:
-             raise NotImplementedError(
-@@ -129,6 +131,7 @@ class PArray:
-         num_devices = len(scheduler.device_manager.get_all_devices())
-         self._cy_parray = CyPArray(
-             self, self.ID, self.parent_ID, self.parent, self._cyparray_state, num_devices)
-+        # record the size in Cython PArray
-         self._cy_parray.set_size(self.subarray_nbytes)
- 
-         target_dev_id = - \
-@@ -301,7 +304,7 @@ class PArray:
-               f"Parent_ID: {self.parent_ID if self.ID != self.parent_ID else None}, "
-               f"Slice: {self._slices[0] if self.ID != self.parent_ID else None}, "
-               f"Bytes: {self.subarray_nbytes}, "
--              f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}")
-+              f"Owner: {'GPU ' + str(self._coherence.owner) if self._coherence.owner != CPU_INDEX else 'CPU'}", flush=True)
-         for device_id, state in self._coherence._local_states.items():
-             if device_id == CPU_INDEX:
-                 device_name = "CPU"
-@@ -311,13 +314,13 @@ class PArray:
- 
-             if isinstance(state, dict):
-                 print(
--                    f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy:  # states of slices is unordered wrt the below slices")
-+                    f"state: {[state_str_map[s] for s in list(state.values())]}, including sliced copy:  # states of slices is unordered wrt the below slices", flush=True)
-                 for slice, slice_id in zip(self._array._indices_map[device_id], range(len(self._array._indices_map[device_id]))):
-                     print(
--                        f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}")
-+                        f"\tslice {slice_id} - indices: {slice}, bytes: {self._array._buffer[device_id][slice_id].nbytes}", flush=True)
-             else:
--                print(f"state: {state_str_map[state]}")
--        print("---End of Overview")
-+                print(f"state: {state_str_map[state]}", flush=True)
-+        print("---End of Overview", flush=True)
- 
-     # slicing/indexing
- 
-@@ -395,7 +398,7 @@ class PArray:
- 
-         with self._coherence_cv[device_id]:
-             operations = self._coherence.evict(device_id, keep_one_copy)
--            if operations[0].inst == MemoryOperation.ERROR:
-+            if len(operations) != 0 and operations[0].inst == MemoryOperation.ERROR:
-                 return False  # cannot perform the eviction
-             self._process_operations(operations)
- 
-@@ -479,11 +482,21 @@ class PArray:
-                 #     f"Evicting {self.name} from {op.src}, size: {to_free} bytes", flush=True)
- 
-                 scheduler = get_scheduler()
--                if (to_free > 0) and (scheduler is not None):
--                    # This frees the memory on the device in the mapped and reserved pools
--                    scheduler.device_manager.free_memory(op.src, to_free)
--                    # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well.
--
-+                if scheduler is not None:
-+                    if to_free > 0:
-+                        # This frees the memory on the device in the mapped and reserved pools
-+                        scheduler.device_manager.free_memory(op.src, to_free)
-+                        # TODO(wlr): This is only for explictly evicted PArrays. PArrays that fall out of scope need to be freed as well.
-+                    src_global_dev_id =
-+                        scheduler.device_manager.parrayid_to_globalid(op.src)
-+                    if self._cy_parray.get_num_referring_tasks(src_global_dev_id) == 0:
-+                        # If none of active tasks refers this PArray,
-+                        # remove this PArray on the src device from
-+                        # the PArray tracker's table.
-+                        scheduler.remove_parray_from_tracker(
-+                            self._cy_parray, src_global_dev_id)
-+                # decrement the reference counter, relying on GC to free the memor
-+                self._array.clear(op.src)
-             elif op.inst == MemoryOperation.ERROR:
-                 raise RuntimeError(
-                     "PArray gets an error from coherence protocol")
-@@ -906,5 +919,11 @@ class PArray:
-     def get_parray_parentid_from_cpp(self):
-         return self._cy_parray.get_parray_parentid()
- 
--    def get_num_active_tasks(self, global_dev_id):
--        return self._cy_parray.get_num_active_tasks(global_dev_id)
-+    def get_num_referring_tasks(self, global_dev_id):
-+        return self._cy_parray.get_num_referring_tasks(global_dev_id)
-+
-+    def __del__(self):
-+        # Users can explicitly call `del` over a Python PArray.
-+        # In this case, detroy its array instance.
-+        # TODO(hc): This code is not tested yet
-+        self._array = None
-diff --git a/src/python/parla/common/parray/memory.py b/src/python/parla/common/parray/memory.py
-index 05cef02..89bf74c 100644
---- a/src/python/parla/common/parray/memory.py
-+++ b/src/python/parla/common/parray/memory.py
-@@ -573,3 +573,9 @@ class MultiDeviceBuffer:
-         self._buffer[device_id] = None
-         self._cyparray_state.set_exist_on_device(device_id, False)
-         return to_free
-+
-+    def __del__(self):
-+        for i in range(0, len(self._buffer)):
-+            self._cyparray_state.set_exist_on_device(i, False)
-+        self._indices_map = None
-+        self._buffer = None
-diff --git a/src/python/parla/cython/CMakeLists.txt b/src/python/parla/cython/CMakeLists.txt
-index 431fc58..7378df9 100644
---- a/src/python/parla/cython/CMakeLists.txt
-+++ b/src/python/parla/cython/CMakeLists.txt
-@@ -14,6 +14,7 @@ add_cython_target(tasks tasks.pyx CXX PY3)
- add_cython_target(scheduler scheduler.pyx CXX PY3)
- add_cython_target(device device.pyx CXX PY3)
- add_cython_target(device_manager device_manager.pyx CXX PY3)
-+add_cython_target(mm mm.pyx CXX PY3)
- add_cython_target(cyparray_state cyparray_state.pyx CXX PY3)
- add_cython_target(cyparray cyparray.pyx CXX PY3)
- add_cython_target(variants variants.pyx CXX PY3)
-@@ -23,6 +24,7 @@ add_library(tasks MODULE ${tasks})
- add_library(scheduler MODULE ${scheduler})
- add_library(device MODULE ${device})
- add_library(device_manager MODULE ${device_manager})
-+add_library(mm MODULE ${mm})
- add_library(cyparray_state MODULE ${cyparray_state})
- add_library(cyparray MODULE ${cyparray})
- add_library(variants MODULE ${variants})
-@@ -37,6 +39,7 @@ install(TARGETS tasks LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/p
- install(TARGETS scheduler LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
- install(TARGETS device LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
- install(TARGETS device_manager LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
-+install(TARGETS mm LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
- install(TARGETS cyparray_state LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
- install(TARGETS cyparray LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
- install(TARGETS variants LIBRARY DESTINATION ${PYTHON_RELATIVE_SITE_PACKAGES_DIR}/parla/cython/)
-@@ -89,6 +92,14 @@ target_include_directories(device_manager PUBLIC ${NumPy_INCLUDE_DIRS})
- target_include_directories(device_manager PUBLIC ${PYTHON_INCLUDE_DIRS})
- target_include_directories(device_manager PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
- 
-+target_link_libraries(mm ${PYTHON_LIBRARIES})
-+target_link_libraries(mm backend)
-+target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend)
-+target_include_directories(mm PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend/include)
-+target_include_directories(mm PUBLIC ${NumPy_INCLUDE_DIRS})
-+target_include_directories(mm PUBLIC ${PYTHON_INCLUDE_DIRS})
-+target_include_directories(mm PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-+
- target_link_libraries(cyparray_state ${PYTHON_LIBRARIES})
- target_link_libraries(cyparray_state backend)
- target_include_directories(cyparray_state PUBLIC ${PROJECT_SOURCE_DIR}/src/c/backend)
-@@ -119,6 +130,7 @@ python_extension_module(tasks)
- python_extension_module(scheduler)
- python_extension_module(device)
- python_extension_module(device_manager)
-+python_extension_module(mm)
- python_extension_module(cyparray_state)
- python_extension_module(cyparray)
- python_extension_module(variants)
-diff --git a/src/python/parla/cython/core.pxd b/src/python/parla/cython/core.pxd
-index 9643d9f..76a3f75 100644
---- a/src/python/parla/cython/core.pxd
-+++ b/src/python/parla/cython/core.pxd
-@@ -4,6 +4,7 @@ cimport cython
- from parla.cython.device_manager cimport DeviceManager
- from parla.cython.device cimport Device, CyDevice
- from parla.cython.cyparray cimport InnerPArray
-+from parla.cython.mm cimport LRUGlobalEvictionManager
- 
- from libc.stdint cimport uint32_t, uint64_t, int64_t
- from libcpp  cimport bool
-@@ -22,6 +23,7 @@ cdef extern from "include/gpu_utility.hpp" nogil:
-     void gpu_busy_sleep(const int device, const unsigned long cycles,
-                     uintptr_t stream_ptr)
- 
-+
- cdef extern from "include/runtime.hpp" nogil:
-     ctypedef void (*launchfunc_t)(void* py_scheduler, void* py_task, void* py_worker)
-     ctypedef void (*stopfunc_t)(void*)
-@@ -130,15 +132,19 @@ cdef extern from "include/runtime.hpp" nogil:
- 
-         bool should_run
-         
--        InnerScheduler(DeviceManager* cpp_device_manager)
-+        InnerScheduler(LRUGlobalEvictionManager* cpp_memory_manager,  DeviceManager* cpp_device_manager)
- 
-         void set_num_workers(int num_workers)
-         void set_py_scheduler(void* py_scheduler)
-         void set_stop_callback(stopfunc_t func)
- 
-+        bool get_should_run()
-+
-         void run() except +
-         void stop()
- 
-+        long long int get_memory_size_to_evict(int dev_id) except +
-+
-         void activate_wrapper()
- 
-         void spawn_task(InnerTask* task)
-@@ -149,7 +155,6 @@ cdef extern from "include/runtime.hpp" nogil:
-         void task_cleanup_presync(InnerWorker* worker, InnerTask* task, int state) except +
-         void task_cleanup_postsync(InnerWorker* worker, InnerTask* task, int state) except +
- 
--        int get_num_active_tasks()
-         void increase_num_active_tasks()
-         void decrease_num_active_tasks()
- 
-@@ -168,7 +173,10 @@ cdef extern from "include/runtime.hpp" nogil:
-         void spawn_wait()
- 
-         void create_parray(InnerPArray* parray, int parray_dev_id)
-+        void remove_parray_from_tracker(InnerPArray* parray, int dev_id)
-         
-+        void invoke_all_cparrays_clear()
-+        bool get_all_pyparrays_clear_flag()
- 
- 
- cdef extern from "include/profiling.hpp" nogil:
-diff --git a/src/python/parla/cython/core.pyx b/src/python/parla/cython/core.pyx
-index d2ac9f9..68891c5 100644
---- a/src/python/parla/cython/core.pyx
-+++ b/src/python/parla/cython/core.pyx
-@@ -12,6 +12,7 @@ from parla.common.globals import AccessMode
- from parla.cython.device cimport Device
- from parla.cython.cyparray cimport CyPArray
- from parla.cython.device_manager cimport CyDeviceManager, DeviceManager
-+from parla.cython.mm cimport CyMM
- import threading
- from enum import IntEnum, auto
- from parla.common.globals import cupy
-@@ -585,14 +586,15 @@ cdef class PyInnerWorker:
- cdef class PyInnerScheduler:
-     cdef InnerScheduler* inner_scheduler
- 
--    def __cinit__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
-+    def __cinit__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
-         cdef InnerScheduler* _inner_scheduler
-         cdef DeviceManager* _cpp_device_manager = <DeviceManager*> cy_device_manager.get_cpp_device_manager()
-+        cdef LRUGlobalEvictionManager* _cpp_memory_manager = <LRUGlobalEvictionManager*> cy_memory_manager.get_cpp_memory_manager()
- 
--        _inner_scheduler = new InnerScheduler(_cpp_device_manager)
-+        _inner_scheduler = new InnerScheduler(_cpp_memory_manager, _cpp_device_manager)
-         self.inner_scheduler = _inner_scheduler
- 
--    def __init__(self, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
-+    def __init__(self, CyMM cy_memory_manager, CyDeviceManager cy_device_manager, int num_workers, float vcus, object python_scheduler):
-         cdef InnerScheduler* _inner_scheduler
-         _inner_scheduler = self.inner_scheduler
- 
-@@ -610,6 +612,14 @@ cdef class PyInnerScheduler:
-     def __dealloc__(self):
-         del self.inner_scheduler
- 
-+    cpdef get_should_run(self):
-+        """
-+        This function checks whether there are remaining tasks
-+        in C scheduler queues.
-+        """
-+        cdef InnerScheduler* c_self = self.inner_scheduler
-+        return c_self.get_should_run()
-+
-     cpdef run(self):
-         cdef InnerScheduler* c_self = self.inner_scheduler
-         with nogil:
-@@ -661,10 +671,6 @@ cdef class PyInnerScheduler:
-         with nogil:
-             c_self.task_cleanup_postsync(c_worker, c_task, state)
- 
--    cpdef get_num_active_tasks(self):
--        cdef InnerScheduler* c_self = self.inner_scheduler
--        return c_self.get_num_active_tasks()
--
-     cpdef increase_num_active_tasks(self):
-         cdef InnerScheduler* c_self = self.inner_scheduler
-         c_self.increase_num_active_tasks()
-@@ -720,6 +726,18 @@ cdef class PyInnerScheduler:
-         cdef InnerScheduler* c_self = self.inner_scheduler
-         return c_self.get_reserved_memory(global_dev_id)
- 
-+    cpdef get_memory_size_to_evict(self, int global_dev_id):
-+        cdef InnerScheduler* c_self = self.inner_scheduler
-+        return c_self.get_memory_size_to_evict(global_dev_id)
-+
-+    cpdef invoke_all_cparrays_clear(self):
-+        cdef InnerScheduler* c_self = self.inner_scheduler
-+        c_self.invoke_all_cparrays_clear()
-+
-+    cpdef get_all_pyparrays_clear_flag(self):
-+        cdef InnerScheduler* c_self = self.inner_scheduler
-+        return c_self.get_all_pyparrays_clear_flag()
-+
- 
- class Resources:
- 
-diff --git a/src/python/parla/cython/cyparray.pxd b/src/python/parla/cython/cyparray.pxd
-index 73223bc..06e7422 100644
---- a/src/python/parla/cython/cyparray.pxd
-+++ b/src/python/parla/cython/cyparray.pxd
-@@ -12,12 +12,12 @@ cdef extern from "include/parray.hpp" namespace "parray":
-     cdef cppclass InnerPArray:
-         InnerPArray(void *, uint64_t, uint64_t, InnerPArray *, PArrayState *i, uint32_t) except +
-         void set_size(uint64_t)
--        uint64_t get_num_active_tasks(uint32_t global_dev_id) except +
-+        uint64_t get_num_referring_tasks(uint32_t global_dev_id) except +
-         const uint64_t get_parent_id() except +
- 
- cdef class CyPArray:
-     # Hold a C++ instance which we're wrapping
-     cdef InnerPArray* cpp_parray
-     cdef InnerPArray* get_cpp_parray(self)
--    cpdef get_num_active_tasks(self, int global_dev_id)
-+    cpdef get_num_referring_tasks(self, int global_dev_id)
-     cpdef get_parray_parentid(self)
-diff --git a/src/python/parla/cython/cyparray.pyx b/src/python/parla/cython/cyparray.pyx
-index 19e9fee..ad13d6a 100644
---- a/src/python/parla/cython/cyparray.pyx
-+++ b/src/python/parla/cython/cyparray.pyx
-@@ -37,8 +37,8 @@ cdef class CyPArray:
-     cdef InnerPArray* get_cpp_parray(self):
-         return self.cpp_parray
- 
--    cpdef get_num_active_tasks(self, int global_dev_id):
--        return self.cpp_parray.get_num_active_tasks(global_dev_id)
-+    cpdef get_num_referring_tasks(self, int global_dev_id):
-+        return self.cpp_parray.get_num_referring_tasks(global_dev_id)
- 
-     cpdef get_parray_parentid(self):
-         return self.cpp_parray.get_parent_id()
-diff --git a/src/python/parla/cython/mm.pxd b/src/python/parla/cython/mm.pxd
-new file mode 100644
-index 0000000..8a8670a
---- /dev/null
-+++ b/src/python/parla/cython/mm.pxd
-@@ -0,0 +1,13 @@
-+from parla.cython.device_manager cimport DeviceManager
-+
-+cdef extern from "include/memory_manager.hpp" nogil:
-+    cdef cppclass LRUGlobalEvictionManager:
-+        LRUGlobalEvictionManager(DeviceManager *)
-+        unsigned long long size(unsigned int device_id)
-+        void *remove_and_return_head_from_zrlist(unsigned int device_id)
-+
-+cdef class CyMM:
-+    cdef LRUGlobalEvictionManager* _inner_mm
-+    cpdef size(self, int dev_id)
-+    cpdef remove_and_return_head_from_zrlist(self, int dev_id)
-+    cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self)
-diff --git a/src/python/parla/cython/mm.pyx b/src/python/parla/cython/mm.pyx
-new file mode 100644
-index 0000000..268e7cf
---- /dev/null
-+++ b/src/python/parla/cython/mm.pyx
-@@ -0,0 +1,62 @@
-+
-+from parla.cython import device_manager
-+
-+from parla.cython.core cimport LRUGlobalEvictionManager
-+from parla.cython cimport device_manager
-+#from parla.cython.core import LRUGlobalEvictionManager
-+
-+class PyMM:
-+    def __init__(self, dm: device_manager.PyDeviceManager):
-+        self._device_manager = device_manager
-+        self._cy_mm = CyMM(dm.get_cy_device_manager())
-+
-+    def size(self, dev_id: int):
-+        return self._cy_mm.size(dev_id)
-+
-+    def remove_and_return_head_from_zrlist(self, dev_id: int):
-+        return self._cy_mm.remove_and_return_head_from_zrlist(dev_id)
-+
-+    def get_cy_memory_manager(self):
-+        return self._cy_mm
-+
-+    def print_memory_stats(self, device_id, label: str):
-+        import psutil
-+        import os
-+        print(f"[{label}] Memory tracking", flush=True)
-+        try:
-+            import cupy
-+            mempool = cupy.get_default_memory_pool()
-+            pinned_mempool = cupy.get_default_pinned_memory_pool()
-+            print((
-+                  f"\t GPU{device_id} {label} CuPy used bytes: {mempool.used_bytes()} \n"
-+                  f"\t GPU{device_id} {label} Free bytes: {mempool.free_bytes()} \n"
-+                  f"\t GPU{device_id} {label} Total bytes: {mempool.total_bytes()} \n"), flush=True)
-+        except ImportError:
-+            print("MM tracker only supports CuPy memory status checking.", flush=True)
-+
-+
-+cdef class CyMM:
-+
-+    def __cinit__(self, device_manager.CyDeviceManager cy_dm):
-+        self._inner_mm = new LRUGlobalEvictionManager(cy_dm.get_cpp_device_manager())
-+
-+    def __dealloc__(self):
-+        del self._inner_mm
-+
-+    cpdef size(self, int dev_id):
-+        cdef LRUGlobalEvictionManager* c_self = self._inner_mm
-+        return c_self.size(dev_id)
-+
-+    cpdef remove_and_return_head_from_zrlist(self, int dev_id):
-+        cdef LRUGlobalEvictionManager* c_self = self._inner_mm
-+        cdef void* py_parray = c_self.remove_and_return_head_from_zrlist(dev_id)
-+        if py_parray == NULL:
-+            # TODO(hc): This path is actually not used.
-+            # It would be great if we can check if this python object is valid
-+            # at here; it can simplify our current mechanism a lot.
-+            return None
-+        else:
-+            return <object> py_parray
-+
-+    cdef LRUGlobalEvictionManager* get_cpp_memory_manager(self):
-+        return self._inner_mm
-diff --git a/src/python/parla/cython/scheduler.pyx b/src/python/parla/cython/scheduler.pyx
-index 3390240..db2caae 100644
---- a/src/python/parla/cython/scheduler.pyx
-+++ b/src/python/parla/cython/scheduler.pyx
-@@ -21,6 +21,7 @@ from parla.cython import tasks
- cimport core
- from parla.cython import core
- from parla.cython.cyparray import CyPArray
-+from parla.cython.mm import PyMM
- 
- from parla.common.globals import _Locals as Locals 
- from parla.common.globals import USE_PYTHON_RUNAHEAD, _global_data_tasks, PREINIT_THREADS
-@@ -231,11 +232,11 @@ class WorkerThread(ControllableThread, SchedulerContext):
-                         #print("Setting environment for task", active_task, flush=True)
-                         active_task.environment = device_context
- 
--
-                         #Writes all 'default' streams and event pointers to c++ task
-                         #This allows their synchronization without the GIL and faster iteration over them
-                         #(only saves initial runtime ones, TODO(wlr): save any user added events or streams after body returns)
-                         device_context.write_to_task(active_task)
-+
-                         #print("Wrote enviornment to task", active_task, flush=True)
- 
-                         #handle event wait in python 
-@@ -294,6 +295,7 @@ class WorkerThread(ControllableThread, SchedulerContext):
-                         elif  isinstance(final_state, tasks.TaskRunahead):
-                             core.binlog_2("Worker", "Runahead task: ", active_task.inner_task, " on worker: ", self.inner_worker)
-                     
-+                        #TODO(wlr): Add better exception handling
-                         #print("Cleaning up Task", active_task, flush=True)
-                         
-                         if USE_PYTHON_RUNAHEAD:
-@@ -314,13 +316,13 @@ class WorkerThread(ControllableThread, SchedulerContext):
-                         if isinstance(final_state, tasks.TaskRunahead):
-                             final_state = tasks.TaskCompleted(final_state.return_value)
-                             active_task.cleanup()
--
-                             core.binlog_2("Worker", "Completed task: ", active_task.inner_task, " on worker: ", self.inner_worker)
- 
-                         # print("Finished Task", active_task, flush=True)
-                         active_task.state = final_state
-                         self.task = None
- 
-+                        self.task = None
-                         nvtx.pop_range(domain="Python Runtime")
-                     elif self._should_run:
-                         raise WorkerThreadException("%r Worker: Woke without a task", self.index)
-@@ -350,7 +352,7 @@ class WorkerThread(ControllableThread, SchedulerContext):
- 
- class Scheduler(ControllableThread, SchedulerContext):
- 
--    def __init__(self, device_manager, n_threads=6, period=0.001):
-+    def __init__(self, memory_manager, device_manager, n_threads=6, period=0.001):
-         super().__init__()
- 
-         self.start_monitor = threading.Condition(threading.Lock())
-@@ -364,9 +366,28 @@ class Scheduler(ControllableThread, SchedulerContext):
-         #TODO: Handle resources better
-         resources = 1.0
- 
-+        self.memory_manager = memory_manager
-         self.device_manager = device_manager
-+        cy_memory_manager = self.memory_manager.get_cy_memory_manager()
-         cy_device_manager = self.device_manager.get_cy_device_manager()
--        self.inner_scheduler = PyInnerScheduler(cy_device_manager, n_threads, resources, self)
-+        self.inner_scheduler = PyInnerScheduler(cy_memory_manager,
-+                                                cy_device_manager,
-+                                                n_threads,
-+                                                resources, self)
-+        # This holds PArray references.
-+        # Through this, it makes a scheduler control a PArray's life cycle.
-+        # For example, this holds the last reference of a PArray and so,
-+        # scheduler (or memory manager) can have the control of thats
-+        # deallocation.
-+        # TODO(hc): However, for now, we only support reset this dictionary
-+        #           not deallocating a single PArray.
-+        #           For now, instead we deallocate PArray instance by
-+        #           removing the internal array reference through evict().
-+        #           In the future, we will have better design for this.
-+        self.active_parrays = {}
-+        # Worker threads and a scheduler both can access the active_parrays
-+        # and so we need a lock to guard that.
-+        self.active_parrays_monitor = threading.Condition(threading.Lock())
- 
-         self.worker_threads = [WorkerThread(self, i) for i in range(n_threads)]
- 
-@@ -382,6 +403,58 @@ class Scheduler(ControllableThread, SchedulerContext):
-     def scheduler(self):
-         return self
- 
-+    def append_active_parray(self, parray: PArray):
-+        """ Append a PArray reference.
-+
-+            :param parray: PArray to be appended
-+        """
-+        with self.active_parrays_monitor:
-+            self.active_parrays[parray.ID] = parray
-+
-+    def remove_active_parray(self, parray: PArray):
-+        """ Remove a PArray reference.
-+
-+            :param parray: PArray to be removed 
-+        """
-+        with self.active_parrays_monitor:
-+            self.active_parrays[parray.ID] = None
-+
-+    def clear_active_parrays(self):
-+        """ Clear all references from active_parrays dictionary.
-+        """
-+        with self.active_parrays_monitor:
-+            import cupy
-+            for k in range(0, 4):
-+                with cupy.cuda.Device(k):
-+                    mempool = cupy.get_default_memory_pool()
-+                    print(f"\t {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True)
-+            # TODO(hc): This is unstable hack.
-+            # This loop is necessary since a worker thread 
-+            # can proceed with the next task graph
-+            # (in case of benchmark/python/benchmark.py)
-+            # and generate and allocate new data while a scheduler
-+            # who is the caller of this function holds its reference.
-+            # It seems like there is a concurrency issue; for example,
-+            # removing active_parrays' reference is not immediately
-+            # caught by a gc. The safest way that I am using is to evict
-+            # each of PArrays at here.
-+            # But still it is unstable way and I will replace this soon.
-+            for k, v in self.active_parrays.items():
-+                for dev in self.device_manager.get_all_devices():
-+                    global_dev_id = dev.get_global_id()
-+                    parray_dev_id = self.device_manager.globalid_to_parrayid(global_dev_id)
-+                    v.evict(parray_dev_id)
-+            # TODO(hc): I am not sure why the above loop is necessary..
-+            #           But otherwise, it doesnt deallocate cupy arrays.
-+            # TODO(hc): This is not considering concurrent execution between
-+            #           a thread and a scheduler..
-+            #           The main thread can allocate new data while or before the scheduler
-+            #           deallocates the old PArrays through this.
-+            for k in range(0, 4):
-+                with cupy.cuda.Device(k):
-+                    mempool = cupy.get_default_memory_pool()
-+                    print(f"\t After {k} Used: {mempool.used_bytes()}, Free: {mempool.free_bytes()}", flush=True)
-+
-     def get_device_reqs_from_placement(self, placement, vcus, memory):
-         return self.device_manager.get_device_reqs_from_placement(placement, vcus, memory)
- 
-@@ -414,10 +487,73 @@ class Scheduler(ControllableThread, SchedulerContext):
-             pass
-             #print("Runtime Stopped", flush=True)
- 
-+    def parray_eviction(self):
-+        py_mm = self.memory_manager
-+        print("Eviction policy is activated")
-+        for cuda_device in self.device_manager.get_devices(DeviceType.CUDA):
-+            global_id = cuda_device.get_global_id()
-+            parray_id = self.device_manager.globalid_to_parrayid(global_id)
-+            # Get target memory size to evict from this device
-+            memory_size_to_evict = \
-+                self.inner_scheduler.get_memory_size_to_evict(global_id)
-+            # Get the number of PArray candidates that are allowed to be evicted
-+            # from Python eviction manager.
-+            num_evictable_parray = py_mm.size(global_id)
-+            # TODO(hc): remove this. this is for test.
-+            import cupy
-+            for i in range(0, num_evictable_parray):
-+                try:
-+                    # Get a PArray from a memory manager to evict.
-+                    evictable_parray = \
-+                        py_mm.remove_and_return_head_from_zrlist(global_id)
-+                    if evictable_parray is not None:
-+                        # TODO(hc): remove this. this is for test.
-+                        for k in range(0, 4):
-+                            with cupy.cuda.Device(k):
-+                                mempool = cupy.get_default_memory_pool()
-+                                print(f"\t OK? {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) 
-+
-+                        evictable_parray.evict(parray_id)
-+
-+                        # TODO(hc): remove this. this is for test.
-+                        for k in range(0, 4):
-+                            with cupy.cuda.Device(k):
-+                                mempool = cupy.get_default_memory_pool()
-+                                print(f"\t OK {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}", flush=True) 
-+
-+                        # Repeat eviction until it gets enough memory.
-+                        memory_size_to_evict -= \
-+                            evictable_parray.nbytes_at(parray_id)
-+                        if memory_size_to_evict <= 0:
-+                            break
-+                except Exception as e:
-+                    print("Failed to find parray evictable", flush=True)
-+        return
-+
-     def run(self):
--        #print("Scheduler: Running", flush=True)
--        self.inner_scheduler.run()
--        #print("Scheduler: Stopped Loop", flush=True)
-+        with self:
-+            while True:
-+                print("Scheduler: Running", flush=True)
-+                self.inner_scheduler.run()
-+                if self.inner_scheduler.get_all_pyparrays_clear_flag():
-+                    # All the references of the PArrays held by
-+                    # a Python scheduler should be destroyed 
-+                    # AFTER C++ scheduler (or memory manager) clears
-+                    # all PArray nodes from the list in the eviction manager.
-+                    # TODO(hc): rename the function to is_cparrays_cleared.
-+                    # TODO(hc): this might be done by a worker thread
-+                    #           who will allocate PArrays.
-+                    #           Otherwise, there should be thread concurrent
-+                    #           over memory allocation/deallocation.
-+                    self.clear_active_parrays()
-+                else:
-+                    should_run = self.inner_scheduler.get_should_run()
-+                    if should_run == False:
-+                        break
-+                    # This case is executed if PArray eviction
-+                    # mechanism was invoked by C++ scheduler.
-+                    self.parray_eviction() 
-+            self.stop_callback()
- 
-     def stop(self):
-         #print("Scheduler: Stopping (Called from Python)", flush=True)
-@@ -437,7 +573,6 @@ class Scheduler(ControllableThread, SchedulerContext):
-     def spawn_task(self, task):
-         #print("Scheduler: Spawning Task", task, flush=True)
-         self.inner_scheduler.spawn_task(task.inner_task)
--
-     
-     def assign_task(self, task, worker):
-         task.state = tasks.TaskRunning(task.func, task.args, task.dependencies)
-@@ -449,7 +584,6 @@ class Scheduler(ControllableThread, SchedulerContext):
-     def spawn_wait(self):
-         self.inner_scheduler.spawn_wait()
-         
--
-     def create_parray(self, cy_parray: CyPArray, parray_dev_id: int):
-         """
-         Reserve PArray instances that are created through
-@@ -514,6 +648,20 @@ class Scheduler(ControllableThread, SchedulerContext):
-         return self.inner_scheduler.get_reserved_parray_state( \
-             global_dev_id, parray_parent_id)
- 
-+    def remove_parray_from_tracker(\
-+        self, cy_parray: CyPArray, did: int):
-+        """
-+        Remove the evicted PArray instance on device `global_dev_id`
-+        from the PArray tracker's table
-+
-+        :param cy_parray: Cython PArray instance to be removed 
-+        :param did: global logical device id where the PArray is evicted
-+        """
-+        self.inner_scheduler.remove_parray_from_tracker(cy_parray, did)
-+
-+    def invoke_all_cparrays_clear(self):
-+        self.inner_scheduler.invoke_all_cparrays_clear()
-+
- 
- def _task_callback(task, body):
-     """
-diff --git a/src/python/parla/cython/tasks.pyx b/src/python/parla/cython/tasks.pyx
-index 98bd759..bd71e44 100644
---- a/src/python/parla/cython/tasks.pyx
-+++ b/src/python/parla/cython/tasks.pyx
-@@ -21,6 +21,8 @@ from parla.common.globals import AccessMode, Storage
- from parla.cython.cyparray import CyPArray
- from parla.common.parray.core import PArray
- from parla.common.globals import SynchronizationType as SyncType 
-+from parla.common.globals import _global_data_tasks
-+
- 
- PyDevice = device.PyDevice
- PyCUDADevice = device.PyCUDADevice
-@@ -554,18 +556,25 @@ class Task:
-                 cy_parray = in_parray.cy_parray
-                 self.inner_task.add_parray(cy_parray,
-                     AccessMode.IN, in_parray_devid)
-+                # Add a PArray reference to a dictionary in a scheduler
-+                # to pass its lifecycle.
-+                self.scheduler.append_active_parray(in_parray)
-             for out_parray_tpl in dataflow.output:
-                 out_parray = out_parray_tpl[0]
-                 out_parray_devid = out_parray_tpl[1]
-                 cy_parray = out_parray.cy_parray
-                 self.inner_task.add_parray(cy_parray,
-                     AccessMode.OUT, out_parray_devid)
-+                self.scheduler.append_active_parray(out_parray)
-             for inout_parray_tpl in dataflow.inout:
-                 inout_parray = inout_parray_tpl[0]
-                 inout_parray_devid = inout_parray_tpl[1]
-                 cy_parray = inout_parray.cy_parray
-                 self.inner_task.add_parray(cy_parray,
-                     AccessMode.INOUT, inout_parray_devid)
-+                # TODO(hc): Maybe we can pass dataflow to reduce
-+                #           lock conflicts.
-+                self.scheduler.append_active_parray(inout_parray)
- 
-     def notify_dependents_wrapper(self):
-         """!
-@@ -713,6 +722,7 @@ class DataMovementTask(Task):
-         idx=0, state=TaskCreated(), scheduler=None, name=None):
-         super().__init__(taskspace, idx, state, scheduler, name)
-         self.parray = parray
-+
-         self.access_mode = access_mode
-         self.assigned_devices = assigned_devices
- 
-@@ -748,7 +758,6 @@ class DataMovementTask(Task):
-             self.parray._auto_move(device_manager.get_parray_id(global_device_id),
-                                    write_flag)
-         """
--#self.parray._auto_move(device_manager.get_parray_id(self.dev_id), write_flag)
-         target_dev = self.assigned_devices[0]
-         global_id = target_dev.get_global_id()
-         parray_id = device_manager.globalid_to_parrayid(global_id)
-@@ -758,7 +767,8 @@ class DataMovementTask(Task):
-         return TaskRunahead(0)
- 
-     def cleanup(self):
--        pass
-+        _global_data_tasks[id(self)] = None
-+        self.parray = None
- 
- ######
- # Task Environment
-@@ -1828,9 +1838,3 @@ class BackendTaskSpace(TaskSpace):
- 
-     def wait(self):
-         self.inner_space.wait()
--
--
--
--
--    
--    
-diff --git a/src/python/parla/utility/execute.py b/src/python/parla/utility/execute.py
-index 36825aa..eadcaaf 100644
---- a/src/python/parla/utility/execute.py
-+++ b/src/python/parla/utility/execute.py
-@@ -261,15 +261,16 @@ def synthetic_kernel_gpu(total_time: int, gil_fraction: Union[Fraction, float],
-     return None
- 
- 
--def create_task_no_data(task, taskspaces, config, data_list=None):
-+def create_task_no_data(task, taskspaces, config, ts_postfix=None, data_list=None):
- 
-     try:
-         # Task ID
-         task_idx = task.task_id.task_idx
--        taskspace = taskspaces[task.task_id.taskspace]
-+        ts_postfix_str = "" if ts_postfix is None else ts_postfix
-+        taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str]
- 
-         # Dependency Info
--        dependencies = [taskspaces[dep.taskspace][dep.task_idx]
-+        dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx]
-                         for dep in task.task_dependencies]
- 
-         # Valid Placement Set
-@@ -326,15 +327,16 @@ def create_task_no_data(task, taskspaces, config, data_list=None):
-         return
- 
- 
--def create_task_eager_data(task, taskspaces, config=None, data_list=None):
-+def create_task_eager_data(task, taskspaces, config=None, ts_postfix=None, data_list=None):
- 
-     try:
-         # Task ID
-         task_idx = task.task_id.task_idx
--        taskspace = taskspaces[task.task_id.taskspace]
-+        ts_postfix_str = "" if ts_postfix is None else ts_postfix
-+        taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str]
- 
-         # Dependency Info
--        dependencies = [taskspaces[dep.taskspace][dep.task_idx]
-+        dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx]
-                         for dep in task.task_dependencies]
- 
-         # Valid Placement Set
-@@ -397,14 +399,15 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None):
-         if config.gil_fraction is not None:
-             gil_fraction = config.gil_fraction
- 
--        # print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True)
-         """
-+        print("Eager data in:", IN, " out:", OUT, " inout:", INOUT, flush=True)
-         print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction,
-             " placement:", placement_set)
--        """
--
-         # TODO(hc): Add data checking.
--        @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT)
-+        print("Memory:", IN[0][0].nbytes)
-+        """
-+        # TODO(hc): remove memory operand
-+        @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set], input=IN, output=OUT, inout=INOUT, memory=IN[0][0].nbytes)
-         async def task_func():
-             if config.verbose:
-                 print(f"+{task.task_id} Running", flush=True)
-@@ -421,15 +424,16 @@ def create_task_eager_data(task, taskspaces, config=None, data_list=None):
-         return
- 
- 
--def create_task_lazy_data(task, taskspaces, config=None, data_list=None):
-+def create_task_lazy_data(task, taskspaces, config=None, ts_postfix=None, data_list=None):
- 
-     try:
-         # Task ID
-         task_idx = task.task_id.task_idx
--        taskspace = taskspaces[task.task_id.taskspace]
-+        ts_postfix_str = "" if ts_postfix is None else ts_postfix
-+        taskspace = taskspaces[task.task_id.taskspace + ts_postfix_str]
- 
-         # Dependency Info
--        dependencies = [taskspaces[dep.taskspace][dep.task_idx]
-+        dependencies = [taskspaces[dep.taskspace + ts_postfix_str][dep.task_idx]
-                         for dep in task.task_dependencies]
- 
-         # Valid Placement Set
-@@ -483,8 +487,11 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None):
- 
-         if config.gil_fraction is not None:
-             gil_fraction = config.gil_fraction
-+        """
-         print("task idx:", task_idx, " dependencies:", dependencies, " vcu:", device_fraction,
-               " placement:", placement_set)
-+        
-+        """
- 
-         @spawn(taskspace[task_idx], dependencies=dependencies, vcus=device_fraction, placement=[placement_set])
-         async def task_func():
-@@ -520,7 +527,7 @@ def create_task_lazy_data(task, taskspaces, config=None, data_list=None):
-         return
- 
- 
--def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, data_list=None):
-+def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, ts_postfix=None, data_list=None):
- 
-     spawn_start_t = time.perf_counter()
- 
-@@ -530,32 +537,36 @@ def execute_tasks(taskspaces, tasks: Dict[TaskID, TaskInfo], run_config: RunConf
-         if run_config.movement_type == MovementType.NO_MOVEMENT:
-             # print("No data movement")
-             create_task_no_data(details, taskspaces,
--                                config=run_config, data_list=data_list)
-+                                config=run_config,
-+                                ts_postfix=ts_postfix,
-+                                data_list=data_list)
-         elif run_config.movement_type == MovementType.EAGER_MOVEMENT:
-             # print("Eager data movement")
-             create_task_eager_data(details, taskspaces,
--                                   config=run_config, data_list=data_list)
-+                                   config=run_config,
-+                                   ts_postfix=ts_postfix,
-+                                   data_list=data_list)
-         elif run_config.movement_type == MovementType.LAZY_MOVEMENT:
-             # print("Lazy data movement")
-             create_task_lazy_data(details, taskspaces,
--                                  config=run_config, data_list=data_list)
--
-+                                  config=run_config,
-+                                  ts_postfix=ts_postfix,
-+                                  data_list=data_list)
-     spawn_end_t = time.perf_counter()
- 
-     return taskspaces
- 
- 
- def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]):
--
-     @spawn(vcus=0)
-     async def main_task():
- 
-         graph_times = []
-+        # Generate data once for multiple iterations.
-+        data_list = generate_data(
-+            data_config, run_config.data_scale, run_config.movement_type)
- 
-         for i in range(run_config.inner_iterations):
--            data_list = generate_data(
--                data_config, run_config.data_scale, run_config.movement_type)
--
-             # Initialize task spaces
-             taskspaces = {}
- 
-@@ -570,10 +581,76 @@ def execute_graph(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo
- 
-             for taskspace in taskspaces.values():
-                 await taskspace
-+            graph_end_t = time.perf_counter()
-+
-+            #worker_thread = get_scheduler_context()
-+            #worker_thread.scheduler.invoke_all_cparrays_clear()
-+
-+            graph_elapsed = graph_end_t - graph_start_t
-+            graph_times.append(graph_elapsed)
-+
-+        graph_times = np.asarray(graph_times)
-+        graph_t = TimeSample(np.mean(graph_times), np.median(graph_times), np.std(
-+            graph_times), np.min(graph_times), np.max(graph_times), len(graph_times))
-+
-+        timing.append(graph_t)
-+
-+
-+def execute_graph_memory2(data_config: Dict[int, DataInfo], tasks: Dict[TaskID, TaskInfo], run_config: RunConfig, timing: List[TimeSample]):
-+    """
-+    This function creates data for each iteration and intentionally decreases
-+    its reference count.
-+    Data generated in the previous iteration becomes unnecessary and a
-+    scheduler's eviction manager evicts those data to CPU.
-+    This function requires the following consideration;
-+    data should be less than CPU memory. So, the number of iterations or
-+    PArray size should not be high.
-+    TODO(hc): it might be a separate test.
-+    """
-+    @spawn(vcus=0)
-+    async def main_task():
-+
-+        graph_times = []
-+        data_list = []
-+        max_memory = 0
-+        for i in range(0, run_config.inner_iterations):
-+            import cupy
-+            # Initialize task spaces
-+            taskspaces = {}
-+
-+            data_list.append(generate_data(data_config, run_config.data_scale, run_config.movement_type))
-+            ts_postfix = "-"+str(i)
-+            # Create a task space with the postfix of the current iteration.
-+            for task, details in tasks.items():
-+                space_name = details.task_id.taskspace
-+                space_name += ts_postfix
-+                if space_name not in taskspaces:
-+                    taskspaces[space_name] = TaskSpace(space_name)
- 
-+            for k in range(0, 4):
-+                with cupy.cuda.Device(k):
-+                    mempool = cupy.get_default_memory_pool()
-+                    print(f"\t Right Before {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") 
-+
-+            graph_start_t = time.perf_counter()
-+
-+            execute_tasks(taskspaces, tasks, run_config, ts_postfix, data_list=data_list[i])
-+
-+            for taskspace in taskspaces.values():
-+                await taskspace
-+            taskspace = None
-             graph_end_t = time.perf_counter()
- 
-+            #worker_thread = get_scheduler_context()
-+            #worker_thread.scheduler.invoke_all_cparrays_clear()
-+
-+            for k in range(0, 4):
-+                with cupy.cuda.Device(k):
-+                    mempool = cupy.get_default_memory_pool()
-+                    print(f"\t Right After {k} Used GPU{k}: {mempool.used_bytes()}, Free Mmeory: {mempool.free_bytes()}") 
-+
-             graph_elapsed = graph_end_t - graph_start_t
-+            print("Iteration:", i, ", execution time:", graph_elapsed, flush=True)
-             graph_times.append(graph_elapsed)
- 
-         graph_times = np.asarray(graph_times)
-@@ -597,7 +674,7 @@ def run(tasks: Dict[TaskID, TaskInfo], data_config: Dict[int, DataInfo] = None,
- 
-         with Parla(logfile=run_config.logfile):
-             internal_start_t = time.perf_counter()
--            execute_graph(data_config, tasks, run_config, timing)
-+            execute_graph_memory2(data_config, tasks, run_config, timing)
-             internal_end_t = time.perf_counter()
- 
-         outer_end_t = time.perf_counter()
-@@ -797,12 +874,14 @@ class GraphContext(object):
-         return self
- 
-     def run(self, run_config: RunConfig, max_time: int = 100):
--
-+        return run(self.graph, self.data_config, run_config)
-+        """
-         @timeout(max_time)
-         def run_with_timeout():
-             return run(self.graph, self.data_config, run_config)
- 
-         return run_with_timeout()
-+        """
- 
-     def __exit__(self, type, value, traceback):
-         self.diro.__exit__(type, value, traceback)

From b2c84851003da2ca9b1b93e61bb3005c6f1a1f9f Mon Sep 17 00:00:00 2001
From: wlruys <wlruys@users.noreply.github.com>
Date: Thu, 18 Jan 2024 03:46:59 +0000
Subject: [PATCH 5/5] style: apply ruff format

---
 src/python/parla/common/globals.py          | 1 +
 src/python/parla/common/parray/core.py      | 2 +-
 src/python/parla/common/parray/from_data.py | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/python/parla/common/globals.py b/src/python/parla/common/globals.py
index 75d9fe41..afc359b0 100644
--- a/src/python/parla/common/globals.py
+++ b/src/python/parla/common/globals.py
@@ -72,6 +72,7 @@ def print_config():
     print("Default Runahead Behavior: ", default_sync, flush=True)
     print("VCU Precision: ", VCU_BASELINE, flush=True)
 
+
 class DeviceType(IntEnum):
     """
     This class declares device types.
diff --git a/src/python/parla/common/parray/core.py b/src/python/parla/common/parray/core.py
index ec251551..3833b7f5 100644
--- a/src/python/parla/common/parray/core.py
+++ b/src/python/parla/common/parray/core.py
@@ -217,7 +217,7 @@ def _current_device_index(self) -> int:
     def set_name(self, name: str):
         self._name = name
 
-    def get(self, device: Optional[PyDevice] = None) -> 'np.ndarray' | 'cp.ndarray':
+    def get(self, device: Optional[PyDevice] = None) -> "np.ndarray" | "cp.ndarray":
         if device is None:
             return self.array
         else:
diff --git a/src/python/parla/common/parray/from_data.py b/src/python/parla/common/parray/from_data.py
index 9f72bc0e..930cbb16 100644
--- a/src/python/parla/common/parray/from_data.py
+++ b/src/python/parla/common/parray/from_data.py
@@ -176,12 +176,12 @@ def get_parray(object, count=0):  # recursively process Sequence or Dictionary
         elif isinstance(object, dict):
             accumulator = {}
             for key, value in object.items():
-                accumulator[key] = get_parray(value, count+1)
+                accumulator[key] = get_parray(value, count + 1)
             return accumulator
         elif isinstance(object, (list, tuple, set)):
             accumulator = []
             for item in object:
-                accumulator.append(get_parray(item, count+1))
+                accumulator.append(get_parray(item, count + 1))
             return type(object)(accumulator)
         else:
             raise TypeError(f"Unsupported Type: {type(object)}")