diff --git a/train-rank/infra/ubuntu_develop/Dockerfile b/train-rank/infra/ubuntu_develop/Dockerfile index ad8ed4b..2ccfe72 100755 --- a/train-rank/infra/ubuntu_develop/Dockerfile +++ b/train-rank/infra/ubuntu_develop/Dockerfile @@ -91,5 +91,14 @@ RUN apt update && \ apt install python3-pip -y && \ pip install cpplint +ENV EIGEN3_INCLUDE_DIR=/usr/local/include/eigen3 +RUN cd /opt && \ + curl -LJO https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz \ + && tar -xzf eigen-3.4.0.tar.gz \ + && cd eigen-3.4.0 && \ + mkdir build && \ + cd build && \ + cmake .. && \ + make install ENTRYPOINT ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/train-rank/src/common_tool.cpp b/train-rank/src/common_tool.cpp index 47b00e0..50290d1 100755 --- a/train-rank/src/common_tool.cpp +++ b/train-rank/src/common_tool.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +using namespace Eigen; // using namespace std; @@ -8,7 +10,8 @@ INITIALIZE_EASYLOGGINGPP -void init_log() { +void init_log() +{ el::Configurations defaultConf; defaultConf.setToDefault(); // Values are always std::string @@ -20,23 +23,29 @@ void init_log() { // To set GLOBAL configurations you may use } -void printVector(const std::vector& a) { +void printVector(const std::vector &a) +{ std::cout << "The vector elements are : "; - for (int i = 0; i < a.size(); i++) std::cout << a.at(i) << ' '; + for (int i = 0; i < a.size(); i++) + std::cout << a.at(i) << ' '; } -bool isStringEmptyOrWhitespace(const std::string& str) { +bool isStringEmptyOrWhitespace(const std::string &str) +{ // Check if the string is empty - if (str.empty()) { + if (str.empty()) + { return true; } // Check if all characters in the string are whitespace return std::all_of(str.begin(), str.end(), - [](unsigned char ch) { return std::isspace(ch); }); + [](unsigned char ch) + { return std::isspace(ch); }); } -int countStringToken(const std::string& content) { +int countStringToken(const std::string &content) +{ 
std::vector tokens; // stringstream class check1 @@ -45,8 +54,10 @@ int countStringToken(const std::string& content) { std::string intermediate; // Tokenizing w.r.t. space ' ' - while (getline(check1, intermediate, ' ')) { - if (isStringEmptyOrWhitespace(intermediate)) { + while (getline(check1, intermediate, ' ')) + { + if (isStringEmptyOrWhitespace(intermediate)) + { continue; } tokens.push_back(intermediate); @@ -54,17 +65,22 @@ int countStringToken(const std::string& content) { return tokens.size(); } -bool isConvertibleToInt(const std::string& str) { +bool isConvertibleToInt(const std::string &str) +{ bool result = false; - try { + try + { std::stoi(str); result = true; - } catch (...) { + } + catch (...) + { } return result; } -std::time_t getTimeStampNow() { +std::time_t getTimeStampNow() +{ std::chrono::time_point tp = std::chrono::time_point_cast( std::chrono::system_clock::now()); @@ -72,11 +88,70 @@ std::time_t getTimeStampNow() { return timestamp; } -std::string envOrBlank(const char* env) { - auto envvar = std::getenv(env); - if (envvar == nullptr) { - return ""; - } else { - return std::string(envvar); +std::string envOrBlank(const char *env) +{ + auto envvar = std::getenv(env); + if (envvar == nullptr) + { + return ""; + } + else + { + return std::string(envvar); + } +} + +int getEnvInt(const char *envVar, int defaultValue) +{ + // Get the environment variable + const char *envValue = std::getenv(envVar); + + // If the environment variable does not exist, return the default value + if (envValue == nullptr) + { + return defaultValue; + } + + // Try to convert the environment variable value to an integer + try + { + return std::stoi(envValue); // Use std::stoi to convert the string to an integer + } + catch (const std::invalid_argument &e) + { + // If the conversion fails (e.g., the environment variable value cannot be converted to an integer), return the default value + std::cerr << "Error: Invalid integer in environment variable '" << envVar << "'." 
<< std::endl; + } + catch (const std::out_of_range &e) + { + // If the integer overflows, return the default value + std::cerr << "Error: Integer overflow in environment variable '" << envVar << "'." << std::endl; + } + + // If an exception occurs, return the default value + return defaultValue; +} + +void calculate_embedding() +{ + int n = 1000; // Assume we have 1000 embeddings, each a vector of size 128 + int embedding_dim = 128; + + // Create an Eigen matrix to store these vectors + MatrixXf embeddings(n, embedding_dim); + + // Fill these vectors + for (int i = 0; i < n; ++i) + { + for (int j = 0; j < embedding_dim; ++j) + { + embeddings(i, j) = static_cast(i + j); } + } + + // Sum the n vectors + VectorXf result = embeddings.colwise().sum(); + + // Print the result + std::cout << "Sum of embeddings: " << result.transpose() << std::endl; } \ No newline at end of file diff --git a/train-rank/src/common_tool.h b/train-rank/src/common_tool.h index 49786f3..783188f 100755 --- a/train-rank/src/common_tool.h +++ b/train-rank/src/common_tool.h @@ -7,11 +7,9 @@ #include #include #include - - using namespace std::chrono; -//DECLARE_string(model_path_root); +// DECLARE_string(model_path_root); #include "easylogging++.h" // using namespace std; @@ -26,18 +24,21 @@ static char TERMINUS_RECOMMEND_EMBEDDING_DIMENSION[] = void init_log(); -void printVector(const std::vector& a); +void printVector(const std::vector &a); + +bool isStringEmptyOrWhitespace(const std::string &str); -bool isStringEmptyOrWhitespace(const std::string& str); +int countStringToken(const std::string &content); -int countStringToken(const std::string& content); +bool isConvertibleToInt(const std::string &str); -bool isConvertibleToInt(const std::string& str); +std::string envOrBlank(const char *env); -std::string envOrBlank(const char* env); +int getEnvInt(const char *envVar, int defaultValue); template -double AUROC(const T1 label[], const T2 score[], int n) { +double AUROC(const T1 label[], const T2 score[], int n) +{ for (int i = 0; i < n; i++) if 
(!std::isfinite(score[i]) || label[i] != 0 && label[i] != 1) return std::numeric_limits::signaling_NaN(); @@ -45,39 +46,45 @@ double AUROC(const T1 label[], const T2 score[], int n) { const auto order = new int[n]; std::iota(order, order + n, 0); std::sort(order, order + n, - [&](int a, int b) { return score[a] > score[b]; }); + [&](int a, int b) + { return score[a] > score[b]; }); const auto y = new double[n]; const auto z = new double[n]; - for (int i = 0; i < n; i++) { + for (int i = 0; i < n; i++) + { y[i] = label[order[i]]; z[i] = score[order[i]]; } - const auto tp = y; // Reuse + const auto tp = y; // Reuse std::partial_sum(y, y + n, tp); - int top = 0; // # diff + int top = 0; // # diff for (int i = 0; i < n - 1; i++) - if (z[i] != z[i + 1]) order[top++] = i; + if (z[i] != z[i + 1]) + order[top++] = i; order[top++] = n - 1; - n = top; // Size of y/z -> sizeof tps/fps + n = top; // Size of y/z -> sizeof tps/fps - const auto fp = z; // Reuse - for (int i = 0; i < n; i++) { - tp[i] = tp[order[i]]; // order is mono. inc. - fp[i] = 1 + order[i] - tp[i]; // Type conversion prevents vectorization + const auto fp = z; // Reuse + for (int i = 0; i < n; i++) + { + tp[i] = tp[order[i]]; // order is mono. inc. 
+ fp[i] = 1 + order[i] - tp[i]; // Type conversion prevents vectorization } delete[] order; const auto tpn = tp[n - 1], fpn = fp[n - 1]; - for (int i = 0; i < n; i++) { // Vectorization + for (int i = 0; i < n; i++) + { // Vectorization tp[i] /= tpn; fp[i] /= fpn; } - auto area = tp[0] * fp[0] / 2; // The first triangle from origin; - double partial = 0; // For Kahan summation - for (int i = 1; i < n; i++) { + auto area = tp[0] * fp[0] / 2; // The first triangle from origin; + double partial = 0; // For Kahan summation + for (int i = 1; i < n; i++) + { const auto x = (fp[i] - fp[i - 1]) * (tp[i] + tp[i - 1]) / 2 - partial; const auto sum = area + x; partial = (sum - area) - x; @@ -91,7 +98,8 @@ double AUROC(const T1 label[], const T2 score[], int n) { } template -std::time_t to_time_t(TP tp) { +std::time_t to_time_t(TP tp) +{ auto sctp = time_point_cast(tp - TP::clock::now() + system_clock::now()); return system_clock::to_time_t(sctp); @@ -100,11 +108,14 @@ std::time_t to_time_t(TP tp) { template ::value_type> std::vector> partitionChunk(InputIt first, InputIt last, - unsigned size) { + unsigned size) +{ std::vector> result; - std::vector* batch{}; - for (unsigned index = 0, row = 0; first != last; ++first, ++index) { - if ((index % size) == 0) { + std::vector *batch{}; + for (unsigned index = 0, row = 0; first != last; ++first, ++index) + { + if ((index % size) == 0) + { result.resize(++row); batch = &result.back(); batch->reserve(size); @@ -116,22 +127,28 @@ std::vector> partitionChunk(InputIt first, InputIt last, std::time_t getTimeStampNow(); +template +std::vector flatten(const std::vector> &vec2D) +{ + std::vector result; + for (const auto &row : vec2D) + { + result.insert(result.end(), row.begin(), row.end()); + } + return result; +} -template -std::vector flatten(const std::vector>& vec2D) { - std::vector result; - for (const auto& row : vec2D) { - result.insert(result.end(), row.begin(), row.end()); +template +void print2DVector(const std::vector> &vec) +{ 
+ for (const auto &row : vec) + { + for (const T &elem : row) + { + std::cout << elem << " "; } - return result; + std::cout << "\n"; + } } -template -void print2DVector(const std::vector>& vec) { - for (const auto& row : vec) { - for (const T& elem : row) { - std::cout << elem << " "; - } - std::cout << "\n"; - } -} \ No newline at end of file +void calculate_embedding(); \ No newline at end of file diff --git a/train-rank/src/rssrank.cpp b/train-rank/src/rssrank.cpp index 571cdce..0521eb0 100755 --- a/train-rank/src/rssrank.cpp +++ b/train-rank/src/rssrank.cpp @@ -14,6 +14,8 @@ namespace fs = std::filesystem; #include #include +#include +using namespace Eigen; #include "common_tool.h" #include "data_process.h" @@ -120,13 +122,60 @@ namespace rssrank return 1.0; } - double result = 1.8 / (1.0 + std::exp(-86400.0 / diff)) - 0.8; + double result = 1.8 / (1.0 + std::exp(-86400.0 / diff)) - 0.8; // when diff->0, result->1.0; when diff->+infinity, result->0.1; when diff=86400, result is about 0.516 (1.8/(1+e^-1) - 0.8) return result; } } // namespace + double getTimeCoefficientForUnixTimestamp(long long timestamp) + { + auto now = boost::posix_time::second_clock::local_time(); // NOTE(review): 'now' is not used anywhere else in this function + auto diff = getTimeStampNow() - timestamp; + + if (diff <= 0) + { + return 1.0; + } + + double result = 1.8 / (1.0 + std::exp(-86400.0 / diff)) - 0.8; // when diff->0, result->1.0; when diff->+infinity, result->0.1; when diff=86400, result is about 0.516 (1.8/(1+e^-1) - 0.8) + + return result; + } + + vector calcluateEmbedding(const vector &impressions, bool with_weight) + { + if (impressions.empty()) + { + return {}; + } + int row = impressions.size(); + int col = impressions[0].embedding.value().size(); + MatrixXf embeddings(row, col); + + // Copy every impression's embedding into one matrix row (column count is taken from the first impression) + for (int i = 0; i < row; ++i) + { + for (int j = 0; j < col; ++j) + { + embeddings(i, j) = impressions[i].embedding.value()[j]; + } + } + if (with_weight) + { + // Scale each row by the score of its impression + for (int i = 0; i < row; ++i) + { + embeddings.row(i) = embeddings.row(i) * getSpecificImpressionScore(impressions[i]); 
+ } + } + // Sum the vectors + VectorXf eig_vec = embeddings.colwise().sum(); + eig_vec = eig_vec / eig_vec.norm(); + return std::vector(eig_vec.data(), eig_vec.data() + eig_vec.size()); + } + vector getImpressionForShortTermAndLongTermUserEmbeddingRank() { // this method just need positive sampllle @@ -199,6 +248,24 @@ namespace rssrank } } + float getSpecificImpressionScoreForShortTermUserEmbedding(const Impression ¤t_impression) + { + float total_score = 0; + if (current_impression.clicked) + { + total_score = total_score + rssrank::short_term_user_embedding_clicked_weight; + if (current_impression.read_finish) + { + total_score = total_score + rssrank::short_term_user_embedding_read_finish_weight; + } + if (current_impression.stared) + { + total_score = total_score + rssrank::short_term_user_embedding_stared_weight; + } + total_score = total_score + rssrank::short_term_user_embedding_time_weight * getTimeCoefficientForUnixTimestamp(current_impression.entry_last_opened); + } + return total_score; + } float getSpecificImpressionScore(const Impression ¤t_impression) { // to do @@ -864,6 +931,7 @@ namespace rssrank bool rankShortTermAndLongTermUserEmbedding() { knowledgebase::EntryCache::getInstance().init(); + vector impressions = getImpressionForShortTermAndLongTermUserEmbeddingRank(); return true; } diff --git a/train-rank/src/rssrank.h b/train-rank/src/rssrank.h index 39b7d98..679975b 100755 --- a/train-rank/src/rssrank.h +++ b/train-rank/src/rssrank.h @@ -31,6 +31,11 @@ namespace rssrank static const float read_speed_weight = 0.3; static const float star_weight = 0.4; + static const float short_term_user_embedding_clicked_weight = 0.1; + static const float short_term_user_embedding_read_finish_weight = 0.3; + static const float short_term_user_embedding_stared_weight = 0.4; + static const float short_term_user_embedding_time_weight = 0.2; + enum class ModelPathType { TRAINING, @@ -83,5 +88,8 @@ namespace rssrank int test_rows, float *test_labels, BoosterHandle 
h_booster, float *biggest_auc); std::vector getImpressionForShortTermAndLongTermUserEmbeddingRank(); + std::vector calcluateEmbedding(const std::vector &impressions, bool with_weight); + double getTimeCoefficientForUnixTimestamp(long long timestamp); + float getSpecificImpressionScoreForShortTermUserEmbedding(const Impression ¤t_impression); } // namespace rssrank diff --git a/train-rank/test/knowledge_api_test.cpp b/train-rank/test/knowledge_api_test.cpp index 06cd682..067de65 100755 --- a/train-rank/test/knowledge_api_test.cpp +++ b/train-rank/test/knowledge_api_test.cpp @@ -196,4 +196,35 @@ TEST(RssRankTest, getImpressionForShortTermAndLongTermUserEmbeddingRank) { std::cout << "current_last_opened " << current.entry_last_opened << std::endl; } +} + +TEST(RssRankTest, TestCalculateEmbeddingMultipleReal) +{ + // --gtest_filter=RssRankTest.TestCalculateEmbeddingMultipleReal + initDevelop(); + init_log(); + knowledgebase::EntryCache::getInstance().init(); + std::vector result = rssrank::getImpressionForShortTermAndLongTermUserEmbeddingRank(); + std::vector embedding = rssrank::calcluateEmbedding(result, true); + for (auto current : embedding) + { + std::cout << current << " "; + } +} + +TEST(RssRankTest, TestCalculateEmbeddingMultiple) +{ + // --gtest_filter=RssRankTest.TestCalculateEmbeddingMultiple + initDevelop(); + init_log(); + + Impression impression1, impression2; + impression1.embedding = std::vector{1.0, 2.0, 3.0}; + impression2.embedding = std::vector{4.0, 5.0, 6.0}; + std::vector impressions = {impression1, impression2}; + std::vector result = rssrank::calcluateEmbedding(impressions, false); + ASSERT_EQ(result.size(), 3); + EXPECT_FLOAT_EQ(result[0], 5.0); + EXPECT_FLOAT_EQ(result[1], 7.0); + EXPECT_FLOAT_EQ(result[2], 9.0); } \ No newline at end of file