diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 12f140e5c..5564657a2 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -189,6 +189,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); + handle_size_map.emplace(create_bo_args.handle, size); vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle); return HSA_STATUS_SUCCESS; @@ -353,11 +354,18 @@ hsa_status_t XdnaDriver::InitDeviceHeap() { return HSA_STATUS_SUCCESS; } -hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map &vmem_handle_mappings) { +hsa_status_t XdnaDriver::GetHandleMappings( + std::unordered_map &vmem_handle_mappings) { vmem_handle_mappings = this->vmem_handle_mappings; return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::GetHandleSizeMap( + std::unordered_map &handle_size_map) { + handle_size_map = this->handle_size_map; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::GetFd(int &fd) { fd = fd_; return HSA_STATUS_SUCCESS; diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 224b85d7c..f47a93cea 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -165,7 +165,8 @@ class AieAqlQueue : public core::Queue, static hsa_status_t SubmitCmd( uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, - std::unordered_map &vmem_handle_mappings); + std::unordered_map &vmem_handle_mappings, + std::unordered_map &handle_size_map); /// @brief Creates a command BO and returns a pointer to the memory and // the corresponding handle @@ -190,7 +191,7 @@ class AieAqlQueue : public core::Queue, /// @brief Syncs all BOs referenced in bo_args /// /// @param bo_args vector containing handles of BOs to sync - static hsa_status_t SyncBos(std::vector &bo_args, int fd); + static hsa_status_t SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size); /// @brief Executes a command and waits for its completion /// diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 79cbaa710..0d86ad3dc 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -71,6 +71,7 @@ class XdnaDriver : public core::Driver { hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; hsa_status_t GetHandleMappings(std::unordered_map &vmem_handle_mappings); + hsa_status_t GetHandleSizeMap(std::unordered_map &handle_size_map); hsa_status_t GetFd(int &fd); hsa_status_t GetAgentProperties(core::Agent &agent) const override; @@ -118,6 +119,7 @@ class XdnaDriver : public core::Driver { // TODO: Remove this once we move to the vmem API std::unordered_map vmem_handle_mappings_reverse; + std::unordered_map handle_size_map; /// @brief Virtual address range allocated for the device heap. /// @@ -128,7 +130,7 @@ class XdnaDriver : public core::Driver { /// @brief The aligned device heap. void *dev_heap_aligned = nullptr; - static constexpr size_t dev_heap_size = 48 * 1024 * 1024; + static constexpr size_t dev_heap_size = 64 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; }; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 6f796441a..5639a02b3 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -82,7 +82,7 @@ constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; // Environment variable to define job submission timeout constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT"; -constexpr int DEFAULT_TIMEOUT_VAL = 50; +constexpr int DEFAULT_TIMEOUT_VAL = 0; char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR); int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr); @@ -219,12 +219,16 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { std::unordered_map vmem_handle_mappings; + std::unordered_map handle_size_map; auto &driver = static_cast( core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)); if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { return; } + if (driver.GetHandleSizeMap(handle_size_map) != HSA_STATUS_SUCCESS) { + return; + } int fd = 0; if (driver.GetFd(fd) != HSA_STATUS_SUCCESS) { @@ -233,17 +237,17 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, - vmem_handle_mappings); + vmem_handle_mappings, handle_size_map); } -hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { - for (unsigned int bo_arg : bo_args) { - amdxdna_drm_sync_bo sync_params = {}; - sync_params.handle = bo_arg; - if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) - return HSA_STATUS_ERROR; +hsa_status_t AieAqlQueue::SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size) { + amdxdna_drm_sync_bo sync_params = {}; + sync_params.handle = bo_arg; + sync_params.direction = direction; + sync_params.size = size; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) { + return HSA_STATUS_ERROR; } - return HSA_STATUS_SUCCESS; } @@ -330,7 +334,8 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, hsa_status_t AieAqlQueue::SubmitCmd( uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, - std::unordered_map &vmem_handle_mappings) { + std::unordered_map &vmem_handle_mappings, + std::unordered_map &handle_size_map) { uint64_t cur_id = read_dispatch_id; while (cur_id < write_dispatch_id) { hsa_amd_aie_ert_packet_t *pkt = @@ -351,9 +356,6 @@ hsa_status_t AieAqlQueue::SubmitCmd( // packets there are. All can be combined into a single chain. int num_cont_start_cu_pkts = 1; for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) { - if (pkt->opcode != HSA_AMD_AIE_ERT_START_CU) { - break; - } num_cont_start_cu_pkts++; } @@ -416,8 +418,10 @@ hsa_status_t AieAqlQueue::SubmitCmd( } // Syncing BOs before we execute the command - if (SyncBos(bo_args, fd)) - return HSA_STATUS_ERROR; + for (auto bo_arg : bo_args) { + if (SyncBo(fd, bo_arg, SYNC_DIRECT_TO_DEVICE, handle_size_map[bo_arg])) + return HSA_STATUS_ERROR; + } // Removing duplicates in the bo container. The driver will report // an error if we provide the same BO handle multiple times. @@ -440,8 +444,12 @@ hsa_status_t AieAqlQueue::SubmitCmd( ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd); // Syncing BOs after we execute the command - if (SyncBos(bo_args, fd)) - return HSA_STATUS_ERROR; + for (auto bo_arg : bo_args) { + if (SyncBo(fd, bo_arg, SYNC_DIRECT_FROM_DEVICE, + handle_size_map[bo_arg])) { + return HSA_STATUS_ERROR; + } + } cur_id += num_cont_start_cu_pkts; break;