Skip to content

Commit

Permalink
Merge pull request #44 from beclab/feat/reconstruct-train-rank
Browse files Browse the repository at this point in the history
feat: for rank train to add short term user  embedding
  • Loading branch information
bleachzou3 authored Dec 17, 2024
2 parents fc00c9f + 9082efc commit 4360785
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 63 deletions.
9 changes: 9 additions & 0 deletions train-rank/infra/ubuntu_develop/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -91,5 +91,14 @@ RUN apt update && \
apt install python3-pip -y && \
pip install cpplint

# Include directory for the Eigen headers (presumably where `make install` below places them
# with CMake's default prefix; confirm if the prefix is ever changed).
ENV EIGEN3_INCLUDE_DIR=/usr/local/include/eigen3
# Download, configure and install Eigen 3.4.0.
#  * `-f` makes curl fail on an HTTP error instead of saving the error page as the tarball.
#  * `-o` pins the output file name rather than trusting the server-supplied one.
#  * The tarball and source tree are removed in the same layer so they do not bloat the image.
RUN cd /opt && \
    curl -fsSL -o eigen-3.4.0.tar.gz https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz && \
    tar -xzf eigen-3.4.0.tar.gz && \
    mkdir eigen-3.4.0/build && \
    cd eigen-3.4.0/build && \
    cmake .. && \
    make install && \
    cd /opt && \
    rm -rf eigen-3.4.0 eigen-3.4.0.tar.gz

ENTRYPOINT ["tail", "-f", "/dev/null"]
113 changes: 94 additions & 19 deletions train-rank/src/common_tool.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
#include <chrono>
#include <iostream>
#include <vector>
#include <eigen3/Eigen/Dense>
using namespace Eigen;

// using namespace std;

#include "easylogging++.h"

INITIALIZE_EASYLOGGINGPP

void init_log() {
void init_log()
{
el::Configurations defaultConf;
defaultConf.setToDefault();
// Values are always std::string
Expand All @@ -20,23 +23,29 @@ void init_log() {
// To set GLOBAL configurations you may use
}

// Writes every element of `a` to std::cout on a single line.
// The output starts with "The vector elements are : " and each element is
// followed by one space; no trailing newline is written.
void printVector(const std::vector<std::string> &a)
{
    std::cout << "The vector elements are : ";

    // Range-for avoids the signed/unsigned comparison of `int i < a.size()`.
    for (const auto &element : a)
    {
        std::cout << element << ' ';
    }
}

// Returns true when `str` is empty or consists only of characters that
// std::isspace classifies as whitespace.
bool isStringEmptyOrWhitespace(const std::string &str)
{
    // std::all_of is true for an empty range, so the empty string needs no
    // separate branch. The lambda takes unsigned char because passing a
    // negative char value to std::isspace is undefined behaviour.
    return std::all_of(str.begin(), str.end(),
                       [](unsigned char ch)
                       { return std::isspace(ch) != 0; });
}

int countStringToken(const std::string& content) {
int countStringToken(const std::string &content)
{
std::vector<std::string> tokens;

// stringstream class check1
Expand All @@ -45,38 +54,104 @@ int countStringToken(const std::string& content) {
std::string intermediate;

// Tokenizing w.r.t. space ' '
while (getline(check1, intermediate, ' ')) {
if (isStringEmptyOrWhitespace(intermediate)) {
while (getline(check1, intermediate, ' '))
{
if (isStringEmptyOrWhitespace(intermediate))
{
continue;
}
tokens.push_back(intermediate);
}
return tokens.size();
}

// Reports whether std::stoi can parse `str` without throwing.
// Note: std::stoi skips leading whitespace and stops at the first character
// it cannot use, so an input such as "12abc" is reported as convertible.
bool isConvertibleToInt(const std::string &str)
{
    try
    {
        std::stoi(str);
        return true;
    }
    catch (const std::logic_error &)
    {
        // std::invalid_argument (no digits) and std::out_of_range (does not
        // fit in int) both derive from std::logic_error.
        return false;
    }
}

// Returns the current std::chrono::system_clock time as a count of whole
// seconds since that clock's epoch (sub-second precision is truncated).
std::time_t getTimeStampNow()
{
    const auto sinceEpoch = std::chrono::system_clock::now().time_since_epoch();
    return static_cast<std::time_t>(
        std::chrono::duration_cast<std::chrono::seconds>(sinceEpoch).count());
}

// Returns the value of the environment variable named `env`, or an empty
// string when the variable is not set (or when `env` itself is null).
std::string envOrBlank(const char *env)
{
    // Guard the name first: std::getenv must not be handed a null pointer.
    if (env == nullptr)
    {
        return "";
    }

    const char *value = std::getenv(env);
    return value == nullptr ? std::string() : std::string(value);
}

// Reads the environment variable `envVar` as an int.
//
// envVar:       name of the environment variable to look up.
// defaultValue: value returned when the variable cannot be used.
//
// Returns the parsed integer. Returns `defaultValue` when the variable is
// unset, is not a valid integer, has trailing non-whitespace characters, or
// does not fit in an int; the last three cases also log a message to std::cerr.
int getEnvInt(const char *envVar, int defaultValue)
{
    // std::getenv must not be handed a null name.
    const char *envValue = (envVar != nullptr) ? std::getenv(envVar) : nullptr;
    if (envValue == nullptr)
    {
        return defaultValue;
    }

    try
    {
        const std::string text(envValue);
        std::size_t consumed = 0;
        const int parsed = std::stoi(text, &consumed);

        // std::stoi stops at the first character it cannot use, so "12abc"
        // would otherwise be silently accepted as 12. Only trailing
        // whitespace is tolerated after the number.
        if (text.find_first_not_of(" \t\n\r\f\v", consumed) == std::string::npos)
        {
            return parsed;
        }
        std::cerr << "Error: Invalid integer in environment variable '" << envVar << "'." << '\n';
    }
    catch (const std::invalid_argument &)
    {
        // No leading digits at all.
        std::cerr << "Error: Invalid integer in environment variable '" << envVar << "'." << '\n';
    }
    catch (const std::out_of_range &)
    {
        // The value does not fit in an int.
        std::cerr << "Error: Integer overflow in environment variable '" << envVar << "'." << '\n';
    }

    // Every failure path falls back to the caller-supplied default.
    return defaultValue;
}

void calculate_embedding()
{
int n = 1000; // 假设我们有 1000 个 embedding,每个是大小为 128 的 vector
int embedding_dim = 128;

// 创建一个 Eigen 矩阵来存储这些向量
MatrixXf embeddings(n, embedding_dim);

// 填充这些向量
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < embedding_dim; ++j)
{
embeddings(i, j) = static_cast<float>(i + j);
}
}

// 将 n 个向量相加
VectorXf result = embeddings.colwise().sum();

// 输出结果
std::cout << "Sum of embeddings: " << result.transpose() << std::endl;
}
103 changes: 60 additions & 43 deletions train-rank/src/common_tool.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,9 @@
#include <numeric>
#include <string>
#include <vector>


using namespace std::chrono;

//DECLARE_string(model_path_root);
// DECLARE_string(model_path_root);

#include "easylogging++.h"
// using namespace std;
Expand All @@ -26,58 +24,67 @@ static char TERMINUS_RECOMMEND_EMBEDDING_DIMENSION[] =

void init_log();

void printVector(const std::vector<std::string>& a);
void printVector(const std::vector<std::string> &a);

bool isStringEmptyOrWhitespace(const std::string &str);

bool isStringEmptyOrWhitespace(const std::string& str);
int countStringToken(const std::string &content);

int countStringToken(const std::string& content);
bool isConvertibleToInt(const std::string &str);

bool isConvertibleToInt(const std::string& str);
std::string envOrBlank(const char *env);

std::string envOrBlank(const char* env);
int getEnvInt(const char *envVar, int defaultValue);

template <class T1, class T2>
double AUROC(const T1 label[], const T2 score[], int n) {
double AUROC(const T1 label[], const T2 score[], int n)
{
for (int i = 0; i < n; i++)
if (!std::isfinite(score[i]) || label[i] != 0 && label[i] != 1)
return std::numeric_limits<double>::signaling_NaN();

const auto order = new int[n];
std::iota(order, order + n, 0);
std::sort(order, order + n,
[&](int a, int b) { return score[a] > score[b]; });
[&](int a, int b)
{ return score[a] > score[b]; });
const auto y = new double[n];
const auto z = new double[n];
for (int i = 0; i < n; i++) {
for (int i = 0; i < n; i++)
{
y[i] = label[order[i]];
z[i] = score[order[i]];
}

const auto tp = y; // Reuse
const auto tp = y; // Reuse
std::partial_sum(y, y + n, tp);

int top = 0; // # diff
int top = 0; // # diff
for (int i = 0; i < n - 1; i++)
if (z[i] != z[i + 1]) order[top++] = i;
if (z[i] != z[i + 1])
order[top++] = i;
order[top++] = n - 1;
n = top; // Size of y/z -> sizeof tps/fps
n = top; // Size of y/z -> sizeof tps/fps

const auto fp = z; // Reuse
for (int i = 0; i < n; i++) {
tp[i] = tp[order[i]]; // order is mono. inc.
fp[i] = 1 + order[i] - tp[i]; // Type conversion prevents vectorization
const auto fp = z; // Reuse
for (int i = 0; i < n; i++)
{
tp[i] = tp[order[i]]; // order is mono. inc.
fp[i] = 1 + order[i] - tp[i]; // Type conversion prevents vectorization
}
delete[] order;

const auto tpn = tp[n - 1], fpn = fp[n - 1];
for (int i = 0; i < n; i++) { // Vectorization
for (int i = 0; i < n; i++)
{ // Vectorization
tp[i] /= tpn;
fp[i] /= fpn;
}

auto area = tp[0] * fp[0] / 2; // The first triangle from origin;
double partial = 0; // For Kahan summation
for (int i = 1; i < n; i++) {
auto area = tp[0] * fp[0] / 2; // The first triangle from origin;
double partial = 0; // For Kahan summation
for (int i = 1; i < n; i++)
{
const auto x = (fp[i] - fp[i - 1]) * (tp[i] + tp[i - 1]) / 2 - partial;
const auto sum = area + x;
partial = (sum - area) - x;
Expand All @@ -91,7 +98,8 @@ double AUROC(const T1 label[], const T2 score[], int n) {
}

template <typename TP>
std::time_t to_time_t(TP tp) {
std::time_t to_time_t(TP tp)
{
auto sctp = time_point_cast<system_clock::duration>(tp - TP::clock::now() +
system_clock::now());
return system_clock::to_time_t(sctp);
Expand All @@ -100,11 +108,14 @@ std::time_t to_time_t(TP tp) {
template <typename InputIt,
typename T = typename std::iterator_traits<InputIt>::value_type>
std::vector<std::vector<T>> partitionChunk(InputIt first, InputIt last,
unsigned size) {
unsigned size)
{
std::vector<std::vector<T>> result;
std::vector<T>* batch{};
for (unsigned index = 0, row = 0; first != last; ++first, ++index) {
if ((index % size) == 0) {
std::vector<T> *batch{};
for (unsigned index = 0, row = 0; first != last; ++first, ++index)
{
if ((index % size) == 0)
{
result.resize(++row);
batch = &result.back();
batch->reserve(size);
Expand All @@ -116,22 +127,28 @@ std::vector<std::vector<T>> partitionChunk(InputIt first, InputIt last,

std::time_t getTimeStampNow();

// Concatenates the rows of `vec2D`, in order, into a single vector.
// Empty rows contribute nothing; an empty input yields an empty result.
template <typename T>
std::vector<T> flatten(const std::vector<std::vector<T>> &vec2D)
{
    // Size the output once so the appends below never reallocate.
    typename std::vector<T>::size_type total = 0;
    for (const auto &row : vec2D)
    {
        total += row.size();
    }

    std::vector<T> result;
    result.reserve(total);
    for (const auto &row : vec2D)
    {
        result.insert(result.end(), row.begin(), row.end());
    }
    return result;
}

// Prints `vec` to std::cout, one row per line.
// Every element is followed by a single space and every row (including an
// empty one) is terminated by a newline.
template <typename T>
void print2DVector(const std::vector<std::vector<T>> &vec)
{
    for (const auto &row : vec)
    {
        for (const T &elem : row)
        {
            std::cout << elem << " ";
        }
        std::cout << "\n";
    }
}
void calculate_embedding();
Loading

0 comments on commit 4360785

Please sign in to comment.