torch/csrc/autograd/profiler_kineto.cpp

#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <torch/csrc/autograd/profiler_kineto.h>

#include <c10/macros/Export.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <c10/util/overloaded.h>
#include <c10/util/variant.h>
#include <c10/util/C++17.h>

#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/nvtx_observer.h>

#include <ATen/Context.h>

#include <deque>
#include <limits>
#include <sstream>
#include <stdexcept>

#ifdef USE_KINETO
#include <libkineto.h>
#include <time_since_epoch.h>

#ifndef _MSC_VER
// TODO: TO be removed, once this properly works from libkineto
// Literal copy-n-paste from third_party/kineto/libkineto/src/WeakSymbols.cpp
extern "C" {
// This function is needed to avoid superfluous dependency on GNU OpenMP library
// when cuPTI is linked statically For more details see
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type() {
  throw std::runtime_error(
      "Dummy implementation of acc_get_device_type is not supposed to be called!");
}
} // extern "C"
#endif // _MSC_VER
#endif // USE_KINETO

namespace torch {
namespace autograd {
namespace profiler {

namespace {
const std::string kMemoryEventName = "[memory]";
// TODO: consider TLS (tid + tls counter)
uint64_t next_correlation_id() {
  static std::atomic<uint64_t> corr_id_{1};
  return corr_id_++;
}

inline int64_t getTimeUs() {
#ifdef USE_KINETO
  return libkineto::timeSinceEpoch(std::chrono::system_clock::now());
#else
  return torch::profiler::impl::getTime() / 1000;
#endif // USE_KINETO
}
} // namespace

namespace python_tracer {
namespace {
CallFn call_fn;
TraceEventsFn get_events_fn;
} // namespace

void registerFunctions(CallFn call, TraceEventsFn get_events) {
  call_fn = call;
  get_events_fn = get_events;
}

void call(Command c) {
  if (call_fn != nullptr) {
    call_fn(c);
  }
}

std::vector<std::unique_ptr<PyTraceEvent>> get_events() {
  return get_events_fn != nullptr
      ? get_events_fn()
      : std::vector<std::unique_ptr<PyTraceEvent>>();
}

// We do not want `getTimeUs` to be directly visible, but we need a way for
// the python tracer to use the same timing convention as the profiler.
int64_t now() {
  return getTimeUs();
}

struct Replay {
  PyTraceEvent* frame_;
  bool enter_;

  C10_NODISCARD int64_t t() const {
    return enter_ ? frame_->startTime_ : frame_->endTime_;
  }

  C10_NODISCARD size_t idx() const {
    return enter_ ? frame_->call_idx_ : frame_->return_idx_;
  }

  bool operator<(const Replay& other) const {
    return idx() < other.idx();
  }
};

void _push_reverse_order(PyTraceEvent* e, std::vector<std::string>& names) {
  if (e != nullptr) {
    _push_reverse_order(e->parent_, names);
    names.push_back(e->name_);
  }
}
} // namespace python_tracer

namespace {
using torch::profiler::impl::ProfilerThreadLocalStateBase;
using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::Result;
using torch::profiler::impl::kineto::annotation_t;
using torch::profiler::impl::shapesToStr;
using torch::profiler::impl::dtypesToStr;
using torch::profiler::impl::stacksToStr;

struct MemoryEventData {
  torch::profiler::impl::approx_time_t start_time;
  void* ptr;
  int64_t alloc_size;
  int64_t total_allocated;
  int64_t total_reserved;
  uint64_t threadID;
  torch::profiler::impl::kineto::DeviceAndResource kineto_info;
  c10::DeviceType device_type;
  c10::DeviceIndex device_index;
};
static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");

struct EventFieldsVisitor {
  EventFieldsVisitor(const Result& result, KinetoEvent& kineto_event)
      : result_{result}, kineto_event_{kineto_event} {
    handleJIT(result_.get().jit_stack_, result_.get().jit_modules_);
    c10::visit(*this, result.event_);
  }

  void operator()(const torch::profiler::impl::OpEvent& op_event) {
    kineto_event_.get()
        .endThreadId(op_event.end_thread_id_)
        .scope(op_event.record_function_scope_)
        .setAsync(op_event.is_async_)
        .debugHandle(op_event.debug_handle_);

    auto& shapes = result_.get().inputs_.shapes_;
    if (!shapes.empty()) {
      kineto_event_.get().shapes(shapes);
      annotations_.emplace_back("Input Dims", shapesToStr(shapes));
    }

    auto& dtypes = result_.get().inputs_.dtypes_;
    if (!dtypes.empty()) {
      kineto_event_.get().dtypes(dtypes);
      annotations_.emplace_back("Input type", dtypesToStr(dtypes));
    }

    if (!result_.get().extra_args_.empty()) {
      kineto_event_.get().flops(
          computeFlops(result_.get().name(), result_.get().extra_args_));
    }
    kineto_event_.get().cuda_event_start_ =
        result_.get().gpu_fallback_.cuda_event_start_;
    kineto_event_.get().cuda_event_end_ =
        result_.get().gpu_fallback_.cuda_event_end_;

    // add information about an associated forward op, if a sequence number
    // is available (e.g. during training)
    if (op_event.sequence_number_ >= 0) {
      kineto_event_.get()
          .sequenceNr(op_event.sequence_number_)
          .fwdThreadId(op_event.forward_thread_id_);
      annotations_.emplace_back(
          "Fwd thread id", std::to_string(op_event.forward_thread_id_));
      annotations_.emplace_back(
          "Sequence number", std::to_string(op_event.sequence_number_));
    }
  }

  void operator()(const torch::profiler::impl::BackendEvent& backend_event) {
    kineto_event_.get()
        .endThreadId(result_.get().start_tid_)
        .scope(backend_event.record_function_scope_)
        .debugHandle(backend_event.debug_handle_)
        .backend(backend_event.backend_);

    if (!backend_event.backend_.empty()) {
      annotations_.emplace_back(
          "Backend", "\"" + backend_event.backend_ + "\"");
    }
  }

  void handleJIT(
      const std::vector<std::string>& jit_stack,
      const std::vector<std::string>& jit_modules) {
    if (!jit_stack.empty()) {
      // NB: This is only for the JIT stack. The python stack (if applicable)
      //     is constructed later.
      kineto_event_.get().stack(jit_stack);
      annotations_.emplace_back(
          "Call stack", torch::profiler::impl::stacksToStr(jit_stack, ";"));
    }

    if (!jit_modules.empty()) {
      kineto_event_.get().moduleHierarchy(jit_modules);
      annotations_.emplace_back(
          "Module Hierarchy",
          torch::profiler::impl::stacksToStr(jit_modules, "."));
    }
  }

  std::reference_wrapper<const Result> result_;
  std::reference_wrapper<KinetoEvent> kineto_event_;
  annotation_t annotations_;
};

auto getAnnotations(const MemoryEventData& event) {
  torch::profiler::impl::kineto::annotation_t out{
      {"Device Type", std::to_string((int8_t)event.device_type)},
      {"Device Id", std::to_string(event.device_index)},
      {"Addr", std::to_string(reinterpret_cast<intptr_t>(event.ptr))},
      {"Bytes", std::to_string(event.alloc_size)}};

  if (event.total_allocated >= 0) {
    out.emplace_back("Total Allocated", std::to_string(event.total_allocated));
  }
  if (event.total_reserved >= 0) {
    out.emplace_back("Total Reserved", std::to_string(event.total_reserved));
  }
  return out;
}

// Assumption: Total threads number will not exceed 2^16-1, and total ops will
// not exceed 2^48 -1.
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
  return (((tid) << 48) | ((seqNr) & (((uint64_t)1 << 48) - 1)));
}

struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
  explicit KinetoThreadLocalState(
      const ProfilerConfig& config,
      std::set<torch::profiler::impl::ActivityType> activities)
      : ProfilerThreadLocalStateBase(config),
        start_time_(getTimeUs()),
        activities_(std::move(activities)),
        record_queue_(config),
        cpu_trace_(start_time_, "PyTorch Profiler") {}
  ~KinetoThreadLocalState() override = default;

  static KinetoThreadLocalState* getTLS() {
    auto tls = ProfilerThreadLocalStateBase::getTLS();
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        tls == nullptr || tls->profilerType() == ActiveProfilerType::KINETO);
    return static_cast<KinetoThreadLocalState*>(tls);
  }

  ActiveProfilerType profilerType() override {
    return ActiveProfilerType::KINETO;
  }

  bool tracePython() {
    return config().with_stack && activities_.count(ActivityType::CPU);
  }

  void reportMemoryUsage(
      void* ptr,
      int64_t alloc_size,
      int64_t total_allocated,
      int64_t total_reserved,
      c10::Device device) override {
    if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
      std::lock_guard<std::mutex> guard(state_mutex_);
      memory_events_.emplace_back(
          torch::profiler::impl::getApproximateTime(),
          ptr,
          alloc_size,
          total_allocated,
          total_reserved,
          at::RecordFunction::currentThreadId(),
          torch::profiler::impl::kineto::kineto_ids(),
          device.type(),
          device.index());
    }
  }

  const post_process_t& getEventPostProcessingCallback() const {
    return event_post_process_cb_;
  }

  void setEventPostProcessingCallback(post_process_t&& cb) {
    event_post_process_cb_ = std::move(cb);
  }

  torch::profiler::impl::kineto::ActivityTraceWrapper finalizeTrace() {
    auto end_time = getTimeUs();
    materializeOpEvents();

    finalizeCPUTrace(cpu_trace_.get());
    {
      std::lock_guard<std::mutex> guard(state_mutex_);
      cpu_trace_.transferCpuTrace(end_time);
    }

    if (config().state != ProfilerState::KINETO_ONDEMAND) {
      auto trace = torch::profiler::impl::kineto::stopTrace();
      TORCH_CHECK(trace || !torch::profiler::kKinetoAvailable);
      addTraceEvents(trace);
      return trace;
    } else {
      return torch::profiler::impl::kineto::ActivityTraceWrapper();
    }
  }

  void materializeOpEvents() {
    std::lock_guard<std::mutex> guard(state_mutex_);
    auto converter = clock_converter_.makeConverter();

    for (const auto& e : memory_events_) {
      auto start_time_us = converter(e.start_time) / 1000;
      cpu_trace_.addCPUActivity(
          kMemoryEventName,
          torch::profiler::impl::kineto::KinetoActivityType::CPU_INSTANT_EVENT,
          e.kineto_info,
          /*correlation_id=*/0,
          start_time_us,
          start_time_us,
          getAnnotations(e));

      kineto_events_.emplace_back();
      auto& evt = kineto_events_.back();
      evt.name(kMemoryEventName)
          .startUs(start_time_us)
          .deviceIndex(e.device_index)
          .deviceType(e.device_type)
          .nBytes(e.alloc_size)
          .startThreadId(e.threadID);
    }
    memory_events_.clear();

    for (auto& e : record_queue_.getRecords(converter)) {
      // `take_data` handles time conversion.
      int64_t start_us = e.start_time_us_;
      int64_t end_us = e.end_time_us_;

      if (end_us < start_us) {
        // We initialize end_us_ to the smallest int64_t, so this means that
        // the op did not finish before we stopped profiling.
        continue;
      }

      // Call events post processing callback before finalizing trace, if there
      // is one.
      if (getEventPostProcessingCallback()) {
        getEventPostProcessingCallback()(
            c10::visit([](const auto& i) { return i.debug_handle_; }, e.event_),
            e.jit_stack_,
            e.jit_modules_);
      }

      kineto_events_.emplace_back();
      kineto_events_.back()
          .name(e.name())
          .startUs(start_us)
          .durationUs(end_us - start_us)
          .correlationId(e.correlation_id())
          .deviceType(c10::DeviceType::CPU)
          .startThreadId(e.start_tid_);

      // NB: also sets fields on `kineto_events_.back()`.
      auto annotations =
          EventFieldsVisitor(e, kineto_events_.back()).annotations_;

      cpu_trace_.addCPUActivity(
          e.name(),
          e.kinetoType(),
          e.kineto_info_,
          e.correlation_id(),
          start_us,
          end_us,
          annotations);
    }
  }

  void finalizeCPUTrace(std::unique_ptr<torch::profiler::impl::kineto::trace_t>& cpu_trace) {
#ifndef USE_KINETO
  }
#else // USE_KINETO
    TORCH_INTERNAL_ASSERT(
        cpu_trace->activities.size() == kineto_events_.size());
    // startThreadId_seqNum to pointer of activity.
    // Low-16bits of startThreadId and low-48bits seqNum are concatenated into
    // one uint64_t variable as key.

    // From the time being, we need disable the forward/backward correlation feature to
    // workaround the crash bug.
    // TODO: by Mike Guo
    // reenable the forward/backward correlation when kineto fix the following raw pointer
    //    GenericTraceActivity.flow.linkedActivity

    /*
    std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>
        tidSeq2activity;

    for (const auto idx : c10::irange(cpu_trace->activities.size())) {
      auto& kineto_event = kineto_events_[idx];
      auto& activity = cpu_trace->activities[idx];

      // add information about an associated forward op, if a sequence number
      // is available (e.g. during training)
      if (kineto_event.sequenceNr() >= 0) {
        generateForwardBackwardLink(
            kineto_event, fwd_bwd_link_id, activity, tidSeq2activity);
      }
    }
    */

    addPythonEvents(cpu_trace);
  }

  void addPythonEvents(std::unique_ptr<torch::profiler::impl::kineto::trace_t>& cpu_trace) {
    if (!tracePython()) {
      return;
    }

    auto py_events = python_tracer::get_events();
    for (const auto& e : py_events) {
      TORCH_INTERNAL_ASSERT(
          !e->thread_id_,
          "Profiler expects only single threaded Python tracing.");
    }

    // The remainder of this function merges the Python and Kineto event
    // streams into a single stream. If Python tracing is not enabled, we want
    // to avoid this process altogether to cut down on processing time.
    if (!py_events.size()) {
      return;
    }

    // Kineto event times
    std::vector<int64_t> op_start_times;
    for (const auto& a : cpu_trace->activities) {
      op_start_times.push_back(a.startTime);
    }
    std::sort(op_start_times.begin(), op_start_times.end());

    // Map PyTraceEvent* to sequential integers for JSON export.
    ska::flat_hash_map<python_tracer::PyTraceEvent*, std::string>
        py_event_indices_{
            { nullptr,
              std::string("null") }};
    for (const auto i : c10::irange(py_events.size())) {
      py_event_indices_.insert({py_events[i].get(), std::to_string(i)});
    }

    ska::flat_hash_map<std::string, size_t> module_counter_;
    ska::flat_hash_map<size_t, std::string> module_id_map_;
    auto record_module_id = [&](python_tracer::PyTraceEvent* e) {
      if (e->call_type_ == python_tracer::CallType::kPyModuleCall &&
          module_id_map_.find(e->module_id_) == module_id_map_.end()) {
        // We use the fact that operator[] will default initialize new keys.
        module_id_map_[e->module_id_] =
            std::to_string(module_counter_[e->name_]++);
      }
    };

    // Python events
    std::vector<python_tracer::Replay> py_replay;
    for (const auto& e : py_events) {
      py_replay.push_back({e.get(), true});
      py_replay.push_back({e.get(), false});
    }
    std::sort(py_replay.begin(), py_replay.end());

    // In order to determine the state of the python interpreter when a
    // particular op is called, we have to replay the python events and note
    // timestamps which are associated with op start times.
    std::vector<python_tracer::PyTraceEvent*> py_stack;
    ska::flat_hash_map<int64_t, python_tracer::PyTraceEvent*> op_py_map;
    auto replay_it = py_replay.begin();
    for (auto t : op_start_times) {
      while (replay_it != py_replay.end() && replay_it->t() <= t) {
        if (replay_it->enter_) {
          py_stack.push_back(replay_it->frame_);
          record_module_id(replay_it->frame_);
        } else {
          TORCH_INTERNAL_ASSERT(py_stack.size());
          TORCH_INTERNAL_ASSERT(py_stack.back() == replay_it->frame_);
          py_stack.pop_back();
        }
        replay_it++;
      }
      op_py_map.insert({t, py_stack.size() ? py_stack.back() : nullptr});
    }

    std::vector<libkineto::GenericTraceActivity> py_activities;
    auto py_events_it = py_events.begin();
    auto py_device = libkineto::processId();
    auto main_thread = libkineto::systemThreadId();
    auto push_py_event = [&]() {
      auto e = (*py_events_it).get();
      libkineto::GenericTraceActivity op(
          cpu_trace->span, libkineto::ActivityType::PYTHON_FUNCTION, e->name_);

      op.device = py_device;
      op.resource = main_thread;
      op.startTime = e->startTime_;
      op.endTime = e->endTime_;

      op.addMetadata("Python id", py_event_indices_.at(e));
      op.addMetadata("Python parent id", py_event_indices_.at(e->parent_));
      op.addMetadata("Python thread", std::to_string(e->thread_id_));
      if (e->call_type_ == python_tracer::CallType::kPyModuleCall) {
        op.addMetadata("Python module id", module_id_map_.at(e->module_id_));
      }

      py_activities.push_back(op);
      py_events_it++;
    };

    TORCH_INTERNAL_ASSERT(cpu_trace->activities.size() == kineto_events_.size());
    for (const auto idx : c10::irange(cpu_trace->activities.size())) {
      auto& activity = cpu_trace->activities[idx];

      // Add any python events that occurred between this Kineto event and the
      // previous Kineto event.
      while (py_events_it != py_events.end() &&
             (*py_events_it)->endTime_ <= activity.endTime) {
        push_py_event();
      }

      auto python_caller = op_py_map.at(activity.startTime);
      activity.addMetadata(
          "python_caller_id", py_event_indices_.at(python_caller));

      // If the kineto event has a stack that means the JIT model has a stack
      // associated with it that we need to respect.
      if (!kineto_events_[idx].hasStack()) {
        std::vector<std::string> py_names;
        _push_reverse_order(python_caller, py_names);
        kineto_events_[idx].stack(py_names);
        activity.addMetadata("Call stack", torch::profiler::impl::stacksToStr(py_names, ";"));
      }
    }

    // Add any Python events which finish after the last Kineto event.
    while (py_events_it != py_events.end()) {
      push_py_event();
    }

    cpu_trace->activities.insert(cpu_trace->activities.end(), py_activities.begin(), py_activities.end());
  }

  void generateForwardBackwardLink(
      const KinetoEvent& kineto_event,
      uint64_t& fwd_bwd_link_id,
      libkineto::GenericTraceActivity& activity,
      std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>&
          tidSeq2activity) {
    if (kineto_event.fwdThreadId() > 0) {
      // act is backward op.
      uint64_t key = getForwardThreadKey(
          kineto_event.fwdThreadId(), kineto_event.sequenceNr());
      auto iter = tidSeq2activity.find(key);
      if (iter != tidSeq2activity.end()) {
        libkineto::GenericTraceActivity* fwd = iter->second;
        fwd->flow.start = true;
        activity.flow.id = fwd->flow.id = fwd_bwd_link_id;
        activity.flow.type = fwd->flow.type = libkineto::kLinkFwdBwd;
        ++fwd_bwd_link_id;
      }
    } else if (kineto_event.startThreadId() != 0) {
      // act is forward op.
      uint64_t key = getForwardThreadKey(
          kineto_event.startThreadId(), kineto_event.sequenceNr());
      // Assumption: Among all ops with same sequence number,
      // the one with biggest start time is most likely launching backward op.
      auto iter = tidSeq2activity.find(key);
      if (iter == tidSeq2activity.end()) {
        tidSeq2activity[key] = &activity;
      } else {
        // Now the sequence number is only incremented on creating a "Node"
        // object for backward pass, by calling
        // "at::sequence_number::get_and_increment()". Among all ops with same
        // sequence number, the one with biggest startTime is the one launching
        // backward op.
        if (activity.startTime >= iter->second->startTime) {
          tidSeq2activity[key] = &activity;
        }
      }
    }
  }
#endif // USE_KINETO

  void addTraceEvents(torch::profiler::impl::kineto::ActivityTraceWrapper& trace) {
#ifdef USE_KINETO
    const auto& events = *(trace.get()->activities());
    for (const auto& ev_ptr : events) {
      if (ev_ptr == nullptr) {
        continue;
      }
      const auto& activity = *ev_ptr;
      // These events are already processed
      if (activity.type() != libkineto::ActivityType::CPU_OP &&
          activity.type() != libkineto::ActivityType::CPU_INSTANT_EVENT &&
          activity.type() != libkineto::ActivityType::USER_ANNOTATION &&
          activity.type() != libkineto::ActivityType::PYTHON_FUNCTION) {
        kineto_events_.emplace_back();
        auto& kineto_event = kineto_events_.back();
        kineto_event.name(activity.name())
            .deviceIndex(activity.deviceId())
            .deviceResourceId(activity.resourceId())
            .startUs(activity.timestamp())
            .durationUs(activity.duration())
            .activityType((uint8_t)activity.type());
        if (activity.linkedActivity()) {
          kineto_event.linkedCorrelationId(
              activity.linkedActivity()->correlationId());
        }
        kineto_event.deviceType(deviceTypeFromActivity(activity.type()));
      }
    }
#endif // USE_KINETO
  }

  uint64_t start_time_;
  torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_;
  std::set<torch::profiler::impl::ActivityType> activities_;
  torch::profiler::impl::RecordQueue record_queue_;
  torch::profiler::impl::AppendOnlyList<MemoryEventData, 1024> memory_events_;
  torch::profiler::impl::kineto::TraceWrapper cpu_trace_;
  std::vector<KinetoEvent> kineto_events_;
  // Optional, if event post-processing is enabled.
  post_process_t event_post_process_cb_;
};

static std::unique_ptr<KinetoThreadLocalState> globalStatePtr;

template<typename... Args>
static void initGlobalState(Args... args) {
  if (globalStatePtr) {
    LOG(WARNING) << "GlobalStatePtr already exists!";
  } else {
    globalStatePtr = std::make_unique<KinetoThreadLocalState>(std::forward<Args>(args)...);
  }
}

static void resetGlobalState() {
  TORCH_INTERNAL_ASSERT(globalStatePtr != nullptr, "Global state ptr cannot be null before resetting");
  globalStatePtr.reset();
}

template<bool use_global>
static KinetoThreadLocalState* getStatePtr() {
  return c10::guts::if_constexpr<use_global>(
      [] { return globalStatePtr.get(); },
      [] { return KinetoThreadLocalState::getTLS(); });
}

template<bool use_global_state_ptr = false>
std::unique_ptr<at::ObserverContext> onFunctionEnter(const at::RecordFunction& fn) {
  auto state_ptr = getStatePtr<use_global_state_ptr>();
  if (!state_ptr) {
    return nullptr;
  }
  auto corr_id = next_correlation_id();
  if (fn.scope() == at::RecordScope::USER_SCOPE) {
    torch::profiler::impl::kineto::pushUserCorrelationId(corr_id);
  } else {
    torch::profiler::impl::kineto::pushCorrelationId(corr_id);
  }
  return state_ptr->record_queue_.getSubqueue()->begin_op(fn, corr_id);
}

// @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter
template<bool use_global_state_ptr = false>
void onFunctionExit(const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {
  auto state_ptr = getStatePtr<use_global_state_ptr>();
  if (!state_ptr) {
    return;
  }
  const auto& config = state_ptr->config();
  auto* kineto_ctx_ptr =
    static_cast<torch::profiler::impl::KinetoObserverContext*>(ctx_ptr);
  TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
  kineto_ctx_ptr->event_->end_time_ = torch::profiler::impl::getApproximateTime();
  kineto_ctx_ptr->event_->end_thread_id_ = at::RecordFunction::currentThreadId();
  if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
    try {
      auto fallback = kineto_ctx_ptr->fallback_;
      TORCH_INTERNAL_ASSERT(fallback != nullptr);
      torch::profiler::impl::cudaStubs()->record(
          nullptr, &fallback->cuda_event_end_, nullptr);
    } catch (const std::exception& e) {
      LOG(WARNING) << "Failed to record CUDA event. " << e.what();
    }
  }

  if (fn.scope() == at::RecordScope::USER_SCOPE) {
    torch::profiler::impl::kineto::popUserCorrelationId();
  } else {
    torch::profiler::impl::kineto::popCorrelationId();
  }
}

template <bool use_global_callback = false>
void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
  auto registration_state_ptr = getStatePtr<use_global_callback>();
  TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set");
  auto recordFunctionCallback =
      at::RecordFunctionCallback(
          onFunctionEnter<use_global_callback>,
          onFunctionExit<use_global_callback>)
          .needsInputs(registration_state_ptr->config().report_input_shapes)
          .scopes(scopes);

  auto handle = c10::guts::if_constexpr<use_global_callback>(
      [&] { return at::addGlobalCallback(recordFunctionCallback); },
      [&] { return at::addThreadLocalCallback(recordFunctionCallback);
      });
  registration_state_ptr->setCallbackHandle(handle);
}

} // namespace

void reportBackendEventToActiveKinetoProfiler(
    const int64_t start_time_us,
    const int64_t end_time_us,
    const int64_t debug_handle,
    const at::RecordScope scope,
    const std::string& event_name,
    const std::string& backend_name) {
  TORCH_INTERNAL_ASSERT(globalStatePtr == nullptr, "On-demand profiling does not support post processing callback");

  auto state_ptr = KinetoThreadLocalState::getTLS();
  if (!state_ptr) {
    return;
  }

  state_ptr->record_queue_.getSubqueue()->emplace_backend_event(
    torch::profiler::impl::BackendEvent {
      start_time_us,
      end_time_us,
      (uint8_t)scope,
      debug_handle,
      event_name,
      backend_name});

  /* no support for input shapes now?
  if (config.report_input_shapes) {
    ctx_ptr->shapes = inputSizes(fn);
    ctx_ptr->dtypes = inputTypes(fn);
  }
  */
}

void prepareProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities) {
  if (config.state == ProfilerState::NVTX) {
    return;
  }
  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
          config.state == ProfilerState::KINETO_GPU_FALLBACK,
      "Supported only in Kineto profiler");
  torch::profiler::impl::kineto::prepareTrace(
      /*cpuOnly=*/!at::hasCUDA(), activities, config.experimental_config);
}

void enableProfilerWithEventPostProcess(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    post_process_t&& cb,
    const std::unordered_set<at::RecordScope>& scopes) {
  TORCH_CHECK(
      config.state != ProfilerState::NVTX,
      "NVTX does not support post processing callback.");
  TORCH_INTERNAL_ASSERT(globalStatePtr == nullptr, "On-demand profiling does not support post processing callback");

  enableProfiler(config, activities, scopes);
  auto state_ptr = KinetoThreadLocalState::getTLS();
  state_ptr->setEventPostProcessingCallback(std::move(cb));
}

void enableProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    const std::unordered_set<at::RecordScope>& scopes) {
  TORCH_CHECK(!profilerEnabled(), "Profiler is already enabled on this thread");
  if (config.state == ProfilerState::NVTX) {
    torch::profiler::impl::pushNVTXCallbacks(config, scopes);
    return;
  }

  TORCH_CHECK(
      config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK ||
      config.state == ProfilerState::KINETO_ONDEMAND);
  TORCH_CHECK(
      !activities.empty(), "No activities specified for Kineto profiler");

  if (config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK) {
    auto state = std::make_shared<KinetoThreadLocalState>(config, activities);
    c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state);

    if (state->tracePython()) {
      python_tracer::call(python_tracer::Command::kStartOne);
    }

    if (activities.count(ActivityType::CPU)) {
      pushProfilingCallbacks<false>(scopes);
    }
    torch::profiler::impl::kineto::startTrace();
  }

  if (config.state == ProfilerState::KINETO_ONDEMAND) {
    initGlobalState(config, activities);

    TORCH_INTERNAL_ASSERT(activities.count(ActivityType::CPU), "Ondemand profiling must enable CPU tracing");
    pushProfilingCallbacks<true>(scopes);
  }
}

std::unique_ptr<ProfilerResult> disableProfiler() {
  auto state_ptr = static_cast<ProfilerThreadLocalStateBase*>(
      (globalStatePtr == nullptr) ? getStatePtr<false>() : getStatePtr<true>());

  const auto& config = state_ptr->config();
  TORCH_CHECK(
      state_ptr &&
          (config.state == ProfilerState::KINETO ||
           config.state == ProfilerState::KINETO_GPU_FALLBACK ||
           config.state == ProfilerState::KINETO_ONDEMAND ||
           config.state == ProfilerState::NVTX),
      "Can't disable Kineto profiler when it's not running");

  if (state_ptr->hasCallbackHandle()) {
    at::removeCallback(state_ptr->callbackHandle());
  }

  // Traces are converged via libkineto automatically for ondemand flow
  if (state_ptr->config().state == ProfilerState::KINETO_ONDEMAND) {
    auto kineto_state_ptr = static_cast<KinetoThreadLocalState*>(state_ptr);
    auto trace = kineto_state_ptr->finalizeTrace();
    resetGlobalState();
    return std::make_unique<ProfilerResult>();
  }

  // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK
  std::unique_ptr<ProfilerResult> result;
  if (state_ptr->config().state == ProfilerState::NVTX) {
    result = std::make_unique<ProfilerResult>();
  }

  if (config.state == ProfilerState::KINETO ||
      config.state == ProfilerState::KINETO_GPU_FALLBACK) {
    auto kineto_state_ptr = static_cast<KinetoThreadLocalState*>(state_ptr);
    if (kineto_state_ptr->tracePython()) {
      python_tracer::call(python_tracer::Command::kStop);
    }

    auto trace = kineto_state_ptr->finalizeTrace();
    if (kineto_state_ptr->tracePython()) {
      python_tracer::call(python_tracer::Command::kClear);
    }

    result = std::make_unique<ProfilerResult>(
        kineto_state_ptr->start_time_,
        std::move(kineto_state_ptr->kineto_events_),
        std::move(trace));
  }

  // Disable thread-local profiler. We can't pop until the very end as it would invalidate
  // the `state_ptr` reference which we need to process the traces.
  (void)c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE);
  return result;
}

int64_t KinetoEvent::cudaElapsedUs() const {
  if (!cuda_event_start_ || !cuda_event_end_) {
    return -1;
  }
  try {
    return (int64_t)torch::profiler::impl::cudaStubs()->elapsed(&cuda_event_start_, &cuda_event_end_);
  } catch (std::exception& e) {
    LOG(WARNING) << "Failed to measure time between two CUDA events. "
                 << e.what();
  }
  return -1;
}

ProfilerResult::ProfilerResult(
    uint64_t start_time,
    std::vector<KinetoEvent> events,
    torch::profiler::impl::kineto::ActivityTraceWrapper trace)
    : trace_start_us_(start_time),
      events_(std::move(events)),
      trace_(std::move(trace)) {}
ProfilerResult::ProfilerResult() = default;
ProfilerResult::~ProfilerResult() = default;

void ProfilerResult::save(const std::string& path) {
  trace_.save(path);
}

} // namespace profiler
} // namespace autograd
} // namespace torch