From 02a69632d458a904d0cd6298088fa53843f0642c Mon Sep 17 00:00:00 2001
From: Jiahao Li
Date: Sun, 22 Oct 2023 10:58:56 +0800
Subject: [PATCH] Remove duplicate test codes (#150)

---
 README.md        |   4 +-
 chatglm.h        |   2 +-
 chatglm_test.cpp | 485 ++++++++++-------------------------------------
 3 files changed, 103 insertions(+), 388 deletions(-)

diff --git a/README.md b/README.md
index 0bff529..8c0b4d3 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ Run `./build/bin/main -h` to explore more options!
 
 **Try Other Models**
 
-<details open>
+<details>
 <summary>ChatGLM2-6B</summary>
 
 ```sh
@@ -263,7 +263,7 @@ python3 web_demo.py -m ../chatglm-ggml.bin
 
 For other models:
 
-<details open>
+<details>
 <summary>ChatGLM2-6B</summary>
 
 ```sh
diff --git a/chatglm.h b/chatglm.h
index d148749..6f3f8a9 100644
--- a/chatglm.h
+++ b/chatglm.h
@@ -1072,7 +1072,7 @@ class InternLM7BForCausalLM : public BasicModelForCausalLM<InternLM7BModel> {
 
   public:
     static constexpr size_t MEM_SIZE = 512 * MB;
-    static constexpr size_t SCRATCH_SIZE = 1024 * MB;
+    static constexpr size_t SCRATCH_SIZE = 1280 * MB;
 };
 
 using InternLM20BAttention =
diff --git a/chatglm_test.cpp b/chatglm_test.cpp
index 72dd96d..cc54139 100644
--- a/chatglm_test.cpp
+++ b/chatglm_test.cpp
@@ -251,6 +251,78 @@ class ChatGLMTest : public ::testing::Test {
 
     float perf_cpu_graph_compute() { return _perf_graph_compute_impl<false>(); }
     float perf_device_graph_compute() { return _perf_graph_compute_impl<true>(); }
+
+    template <typename Model>
+    void test_model(const Model &model, const ModelConfig &config, const fs::path &data_path, int seq_len,
+                    const std::vector<ggml_tensor *> &all_weights) {
+        ASSERT_EQ(config.num_hidden_layers, 1);
+
+        MappedFile mapped_file(data_path.string());
+        char *ptr = mapped_file.data;
+
+        tensor_to_device(model.layers[0].attention.k_cache);
+        tensor_to_device(model.layers[0].attention.v_cache);
+
+        ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len);
+        ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len);
+        ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
+        ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
+        ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
+        ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
+
+        std::vector<ggml_tensor *> all_tensors = all_weights;
+        all_tensors.insert(all_tensors.end(), {x1, ref_y1, x2, ref_y2, x3, ref_y3});
+
+        std::vector<ggml_tensor *> cpu_tensors{model.word_embeddings.weight, x1, x2, x3};
+
+        for (auto tensor : all_tensors) {
+            ptr = read_tensor_data(ptr, tensor);
+            if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) {
+                tensor_to_device(tensor);
+            }
+        }
+
+        ASSERT_EQ(ptr, mapped_file.data + mapped_file.size);
+
+        // self attention
+        {
+            ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, seq_len);
+            EXPECT_EQ(out_y1->backend, ref_y1->backend);
+            out_y1->backend = GGML_BACKEND_CPU;
+            ggml_build_forward_expand(&ctx.gf, out_y1);
+            device_graph_compute(1);
+
+            expect_all_close(ref_y1, out_y1, 5e-4);
+        }
+
+        // cross attention
+        reset_cgraph();
+        {
+            ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len);
+            EXPECT_EQ(out_y2->backend, ref_y2->backend);
+            out_y2->backend = GGML_BACKEND_CPU;
+            ggml_build_forward_expand(&ctx.gf, out_y2);
+            device_graph_compute(1);
+
+            expect_all_close(ref_y2, out_y2, 5e-4);
+        }
+        reset_cgraph();
+        {
+            ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len);
+            EXPECT_EQ(out_y3->backend, ref_y3->backend);
+            out_y3->backend = GGML_BACKEND_CPU;
+            ggml_build_forward_expand(&ctx.gf, out_y3);
+            device_graph_compute(1);
+
+            expect_all_close(ref_y3, out_y3, 5e-4);
+        }
+
+        for (auto tensor : all_tensors) {
+            tensor_to_cpu(tensor);
+        }
+        tensor_to_cpu(model.layers[0].attention.k_cache);
+        tensor_to_cpu(model.layers[0].attention.v_cache);
+    }
 };
 
 TEST_F(ChatGLMTest, Embedding) {
@@ -483,9 +555,7 @@ TEST_F(ChatGLMTest, BenchmarkRMSNorm) {
 }
 
 TEST_F(ChatGLMTest, GLMModel) {
-    fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/glm_model.data";
-    MappedFile mapped_file(test_path.string());
-    char *ptr = mapped_file.data;
+    fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/glm_model.data";
"tests/data/glm_model.data"; ModelConfig config; config.hidden_size = 32; @@ -501,17 +571,7 @@ TEST_F(ChatGLMTest, GLMModel) { ChatGLMModel model(&ctx, config); - tensor_to_device(model.layers[0].attention.k_cache); - tensor_to_device(model.layers[0].attention.v_cache); - - ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len); - ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len); - ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1); - ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size); - ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1); - ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size); - - std::vector all_tensors{model.word_embeddings.weight, + std::vector all_weights{model.word_embeddings.weight, model.layers[0].input_layernorm.weight, model.layers[0].input_layernorm.bias, model.layers[0].attention.query_key_value.weight, @@ -525,62 +585,9 @@ TEST_F(ChatGLMTest, GLMModel) { model.layers[0].mlp.dense_4h_to_h.weight, model.layers[0].mlp.dense_4h_to_h.bias, model.final_layernorm.weight, - model.final_layernorm.bias, - x1, - ref_y1, - x2, - ref_y2, - x3, - ref_y3}; - std::vector cpu_tensors{model.word_embeddings.weight, x1, x2, x3}; - - for (auto tensor : all_tensors) { - ptr = read_tensor_data(ptr, tensor); - if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) { - tensor_to_device(tensor); - } - } + model.final_layernorm.bias}; - ASSERT_EQ(ptr, mapped_file.data + mapped_file.size); - - // self attention - { - ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, seq_len); - EXPECT_EQ(out_y1->backend, ref_y1->backend); - out_y1->backend = GGML_BACKEND_CPU; - ggml_build_forward_expand(&ctx.gf, out_y1); - device_graph_compute(1); - - expect_all_close(ref_y1, out_y1, 5e-4); - } - - // cross attention - reset_cgraph(); - { - ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len); - EXPECT_EQ(out_y2->backend, ref_y2->backend); - out_y2->backend = GGML_BACKEND_CPU; - ggml_build_forward_expand(&ctx.gf, out_y2); - device_graph_compute(1); - - expect_all_close(ref_y2, out_y2, 5e-4); - } - reset_cgraph(); - { - ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len); - EXPECT_EQ(out_y3->backend, ref_y3->backend); - out_y3->backend = GGML_BACKEND_CPU; - ggml_build_forward_expand(&ctx.gf, out_y3); - device_graph_compute(1); - - expect_all_close(ref_y3, out_y3, 5e-4); - } - - for (auto tensor : all_tensors) { - tensor_to_cpu(tensor); - } - tensor_to_cpu(model.layers[0].attention.k_cache); - tensor_to_cpu(model.layers[0].attention.v_cache); + test_model(model, config, data_path, seq_len, all_weights); } // TEST_F(ChatGLMTest, BenchmarkGLMBlock) { @@ -645,9 +652,7 @@ TEST_F(ChatGLMTest, GLMModel) { // } TEST_F(ChatGLMTest, GLM2Model) { - fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/glm2_model.data"; - MappedFile mapped_file(test_path.string()); - char *ptr = mapped_file.data; + fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/glm2_model.data"; ModelConfig config; config.vocab_size = 5; @@ -666,14 +671,7 @@ TEST_F(ChatGLMTest, GLM2Model) { tensor_to_device(model.layers[0].attention.k_cache); tensor_to_device(model.layers[0].attention.v_cache); - ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len); - ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, 
-    ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-    ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-
-    std::vector<ggml_tensor *> all_tensors{model.word_embeddings.weight,
+    std::vector<ggml_tensor *> all_weights{model.word_embeddings.weight,
                                            model.layers[0].input_layernorm.weight,
                                            model.layers[0].attention.query_key_value.weight,
                                            model.layers[0].attention.query_key_value.bias,
@@ -682,80 +680,9 @@ TEST_F(ChatGLMTest, GLM2Model) {
                                            model.layers[0].attention.dense.weight,
                                            model.layers[0].mlp.gate_proj.weight,
                                            model.layers[0].mlp.up_proj.weight,
                                            model.layers[0].mlp.down_proj.weight,
-                                           model.final_layernorm.weight,
-                                           x1,
-                                           ref_y1,
-                                           x2,
-                                           ref_y2,
-                                           x3,
-                                           ref_y3};
-    std::vector<ggml_tensor *> cpu_tensors{model.word_embeddings.weight, x1, x2, x3};
-
-    for (auto tensor : all_tensors) {
-        ptr = read_tensor_data(ptr, tensor);
-        if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) {
-            tensor_to_device(tensor);
-        }
-    }
-
-    ASSERT_EQ(ptr, mapped_file.data + mapped_file.size);
-
-    float eps = 2e-4;
-
-#ifdef GGML_USE_METAL
-    // convert gemm weights to fp16
-    std::vector<ggml_tensor **> gemm_weight_ptrs{
-        &model.layers[0].attention.query_key_value.weight, &model.layers[0].attention.dense.weight,
-        &model.layers[0].mlp.gate_proj.weight, &model.layers[0].mlp.up_proj.weight,
-        &model.layers[0].mlp.down_proj.weight};
-    for (auto weight_ptr : gemm_weight_ptrs) {
-        ggml_tensor *weight = *weight_ptr;
-        ggml_tensor *fp16_weight = ggml_new_tensor(ctx.ctx_b.get(), GGML_TYPE_F16, weight->n_dims, weight->ne);
-        ggml_fp32_to_fp16_row((float *)weight->data, (ggml_fp16_t *)fp16_weight->data, ggml_nelements(weight));
-        *weight_ptr = fp16_weight;
-    }
-    eps = 5e-4;
-#endif
+                                           model.final_layernorm.weight};
 
-    // self attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, seq_len);
-        EXPECT_EQ(out_y1->backend, ref_y1->backend);
-        out_y1->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y1);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y1, out_y1, eps);
-    }
-
-    // cross attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len);
-        EXPECT_EQ(out_y2->backend, ref_y2->backend);
-        out_y2->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y2);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y2, out_y2, eps);
-    }
-    reset_cgraph();
-    {
-        ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len);
-        EXPECT_EQ(out_y3->backend, ref_y3->backend);
-        out_y3->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y3);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y3, out_y3, eps);
-    }
-
-    for (auto tensor : all_tensors) {
-        tensor_to_cpu(tensor);
-    }
-    tensor_to_cpu(model.layers[0].attention.k_cache);
-    tensor_to_cpu(model.layers[0].attention.v_cache);
+    test_model(model, config, data_path, seq_len, all_weights);
 }
 
 // TEST_F(ChatGLMTest, BenchmarkGLM2Block) {
@@ -825,9 +752,7 @@ TEST_F(ChatGLMTest, GLM2Model) {
 // }
 
 TEST_F(ChatGLMTest, Baichuan7BModel) {
-    fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/baichuan7b_model.data";
-    MappedFile mapped_file(test_path.string());
-    char *ptr = mapped_file.data;
+    fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/baichuan7b_model.data";
 
     ModelConfig config;
     config.hidden_size = 32;
@@ -843,17 +768,7 @@ TEST_F(ChatGLMTest, Baichuan7BModel) {
 
     Baichuan7BModel model(&ctx, config);
 
-    tensor_to_device(model.layers[0].attention.k_cache);
-    tensor_to_device(model.layers[0].attention.v_cache);
-
-    ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len);
-    ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len);
-    ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-    ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-
-    std::vector<ggml_tensor *> all_tensors{model.word_embeddings.weight,
+    std::vector<ggml_tensor *> all_weights{model.word_embeddings.weight,
                                            model.layers[0].input_layernorm.weight,
                                            model.layers[0].attention.query_key_value.weight,
                                            model.layers[0].attention.dense.weight,
@@ -861,70 +776,13 @@ TEST_F(ChatGLMTest, Baichuan7BModel) {
                                            model.layers[0].mlp.gate_proj.weight,
                                            model.layers[0].mlp.down_proj.weight,
                                            model.layers[0].mlp.up_proj.weight,
-                                           model.final_layernorm.weight,
-                                           x1,
-                                           ref_y1,
-                                           x2,
-                                           ref_y2,
-                                           x3,
-                                           ref_y3};
-    std::vector<ggml_tensor *> cpu_tensors{model.word_embeddings.weight, x1, x2, x3};
+                                           model.final_layernorm.weight};
 
-    for (auto tensor : all_tensors) {
-        ptr = read_tensor_data(ptr, tensor);
-        if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) {
-            tensor_to_device(tensor);
-        }
-    }
-    ASSERT_EQ(ptr, mapped_file.data + mapped_file.size);
-
-    float eps = 5e-4;
-
-    // self attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, seq_len);
-        EXPECT_EQ(out_y1->backend, ref_y1->backend);
-        out_y1->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y1);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y1, out_y1, eps);
-    }
-
-    // cross attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len);
-        EXPECT_EQ(out_y2->backend, ref_y2->backend);
-        out_y2->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y2);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y2, out_y2, eps);
-    }
-    reset_cgraph();
-    {
-        ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len);
-        EXPECT_EQ(out_y3->backend, ref_y3->backend);
-        out_y3->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y3);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y3, out_y3, eps);
-    }
-
-    for (auto tensor : all_tensors) {
-        tensor_to_cpu(tensor);
-    }
-    tensor_to_cpu(model.layers[0].attention.k_cache);
-    tensor_to_cpu(model.layers[0].attention.v_cache);
+    test_model(model, config, data_path, seq_len, all_weights);
 }
 
 TEST_F(ChatGLMTest, Baichuan13BModel) {
-    fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/baichuan13b_model.data";
-    MappedFile mapped_file(test_path.string());
-    char *ptr = mapped_file.data;
+    fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/baichuan13b_model.data";
 
     ModelConfig config;
     config.hidden_size = 32;
@@ -940,17 +798,7 @@ TEST_F(ChatGLMTest, Baichuan13BModel) {
 
     Baichuan13BModel model(&ctx, config);
 
-    tensor_to_device(model.layers[0].attention.k_cache);
-    tensor_to_device(model.layers[0].attention.v_cache);
-
-    ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len);
-    ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len);
-    ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-    ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-
-    std::vector<ggml_tensor *> all_tensors{model.word_embeddings.weight,
+    std::vector<ggml_tensor *> all_weights{model.word_embeddings.weight,
                                            model.layers[0].input_layernorm.weight,
                                            model.layers[0].attention.query_key_value.weight,
                                            model.layers[0].attention.dense.weight,
@@ -958,85 +806,13 @@ TEST_F(ChatGLMTest, Baichuan13BModel) {
                                            model.layers[0].mlp.gate_proj.weight,
                                            model.layers[0].mlp.down_proj.weight,
                                            model.layers[0].mlp.up_proj.weight,
-                                           model.final_layernorm.weight,
-                                           x1,
-                                           ref_y1,
-                                           x2,
-                                           ref_y2,
-                                           x3,
-                                           ref_y3};
-    std::vector<ggml_tensor *> cpu_tensors{model.word_embeddings.weight, x1, x2, x3};
-
-    for (auto tensor : all_tensors) {
-        ptr = read_tensor_data(ptr, tensor);
-        if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) {
-            tensor_to_device(tensor);
-        }
-    }
-    ASSERT_EQ(ptr, mapped_file.data + mapped_file.size);
-
-    float eps = 5e-4;
-
-#ifdef GGML_USE_METAL
-    // convert gemm weights to fp16
-    std::vector<ggml_tensor **> gemm_weight_ptrs{
-        &model.layers[0].attention.query_key_value.weight, &model.layers[0].attention.dense.weight,
-        &model.layers[0].mlp.gate_proj.weight, &model.layers[0].mlp.down_proj.weight,
-        &model.layers[0].mlp.up_proj.weight};
-    for (auto weight_ptr : gemm_weight_ptrs) {
-        ggml_tensor *weight = *weight_ptr;
-        ggml_tensor *fp16_weight = ggml_new_tensor(ctx.ctx_b.get(), GGML_TYPE_F16, weight->n_dims, weight->ne);
-        ggml_fp32_to_fp16_row((float *)weight->data, (ggml_fp16_t *)fp16_weight->data, ggml_nelements(weight));
-        *weight_ptr = fp16_weight;
-    }
-    eps = 5e-4;
-#endif
-
-    // self attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, seq_len);
-        EXPECT_EQ(out_y1->backend, ref_y1->backend);
-        out_y1->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y1);
-        device_graph_compute(get_num_threads());
+                                           model.final_layernorm.weight};
 
-        expect_all_close(ref_y1, out_y1, eps);
-    }
-
-    // cross attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len);
-        EXPECT_EQ(out_y2->backend, ref_y2->backend);
-        out_y2->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y2);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y2, out_y2, eps);
-    }
-    reset_cgraph();
-    {
-        ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len);
-        EXPECT_EQ(out_y3->backend, ref_y3->backend);
-        out_y3->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y3);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y3, out_y3, eps);
-    }
-
-    for (auto tensor : all_tensors) {
-        tensor_to_cpu(tensor);
-    }
-    tensor_to_cpu(model.layers[0].attention.k_cache);
-    tensor_to_cpu(model.layers[0].attention.v_cache);
+    test_model(model, config, data_path, seq_len, all_weights);
 }
 
 TEST_F(ChatGLMTest, InternLMModel) {
-    fs::path test_path = fs::path(__FILE__).parent_path() / "tests/data/internlm_model.data";
-    MappedFile mapped_file(test_path.string());
-    char *ptr = mapped_file.data;
+    fs::path data_path = fs::path(__FILE__).parent_path() / "tests/data/internlm_model.data";
 
     ModelConfig config;
     config.hidden_size = 32;
@@ -1052,17 +828,7 @@ TEST_F(ChatGLMTest, InternLMModel) {
 
     InternLM7BModel model(&ctx, config);
 
-    tensor_to_device(model.layers[0].attention.k_cache);
-    tensor_to_device(model.layers[0].attention.v_cache);
-
-    ggml_tensor *x1 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, seq_len);
-    ggml_tensor *ref_y1 = ggml_new_tensor_2d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size, seq_len);
-    ggml_tensor *x2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y2 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-    ggml_tensor *x3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_I32, 1);
-    ggml_tensor *ref_y3 = ggml_new_tensor_1d(ctx.ctx_b.get(), GGML_TYPE_F32, config.hidden_size);
-
-    std::vector<ggml_tensor *> all_tensors{model.word_embeddings.weight,
+    std::vector<ggml_tensor *> all_weights{model.word_embeddings.weight,
                                            model.layers[0].input_layernorm.weight,
                                            model.layers[0].attention.query_key_value.weight,
                                            model.layers[0].attention.query_key_value.bias,
@@ -1072,64 +838,9 @@ TEST_F(ChatGLMTest, InternLMModel) {
                                            model.layers[0].attention.dense.weight,
                                            model.layers[0].attention.dense.bias,
                                            model.layers[0].mlp.gate_proj.weight,
                                            model.layers[0].mlp.up_proj.weight,
                                            model.layers[0].mlp.down_proj.weight,
-                                           model.final_layernorm.weight,
-                                           x1,
-                                           ref_y1,
-                                           x2,
-                                           ref_y2,
-                                           x3,
-                                           ref_y3};
-    std::vector<ggml_tensor *> cpu_tensors{model.word_embeddings.weight, x1, x2, x3};
+                                           model.final_layernorm.weight};
 
-    for (auto tensor : all_tensors) {
-        ptr = read_tensor_data(ptr, tensor);
-        if (std::find(cpu_tensors.begin(), cpu_tensors.end(), tensor) == cpu_tensors.end()) {
-            tensor_to_device(tensor);
-        }
-    }
-    ASSERT_EQ(ptr, mapped_file.data + mapped_file.size);
-
-    float eps = 5e-4;
-
-    // self attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y1 = model.forward(&ctx, x1, 0, seq_len);
-        EXPECT_EQ(out_y1->backend, ref_y1->backend);
-        out_y1->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y1);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y1, out_y1, eps);
-    }
-
-    // cross attention
-    reset_cgraph();
-    {
-        ggml_tensor *out_y2 = model.forward(&ctx, x2, seq_len, seq_len);
-        EXPECT_EQ(out_y2->backend, ref_y2->backend);
-        out_y2->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y2);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y2, out_y2, eps);
-    }
-    reset_cgraph();
-    {
-        ggml_tensor *out_y3 = model.forward(&ctx, x3, seq_len + 1, seq_len);
-        EXPECT_EQ(out_y3->backend, ref_y3->backend);
-        out_y3->backend = GGML_BACKEND_CPU;
-        ggml_build_forward_expand(&ctx.gf, out_y3);
-        device_graph_compute(get_num_threads());
-
-        expect_all_close(ref_y3, out_y3, eps);
-    }
-
-    for (auto tensor : all_tensors) {
-        tensor_to_cpu(tensor);
-    }
-    tensor_to_cpu(model.layers[0].attention.k_cache);
-    tensor_to_cpu(model.layers[0].attention.v_cache);
+    test_model(model, config, data_path, seq_len, all_weights);
 }
 
 TEST_F(ChatGLMTest, quantize) {
@@ -1236,6 +947,7 @@ TEST_F(ChatGLMTest, quantize) {
 struct TokenizerTestCase {
     std::string prompt;
     std::vector<int> input_ids;
+    bool skip_decode = false;
 };
 
 static void check_tokenizer(const BaseTokenizer *tokenizer, const std::vector<TokenizerTestCase> &cases) {
@@ -1243,9 +955,11 @@ static void check_tokenizer(const BaseTokenizer *tokenizer, const std::vector<TokenizerTestCase> &cases) {
     for (const auto &c : cases) {
         // encode
         std::vector<int> input_ids = tokenizer->encode(c.prompt, 2048);
        EXPECT_TRUE(equal(input_ids, c.input_ids));
-        // decode
-        std::string output = tokenizer->decode(c.input_ids);
-        EXPECT_EQ(output, c.prompt);
+        if (!c.skip_decode) {
+            // decode
+            std::string output = tokenizer->decode(c.input_ids);
+            EXPECT_EQ(output, c.prompt);
+        }
     }
 }
@@ -1565,7 +1279,8 @@ TEST(Pipeline, InternLM) {
          "<|Bot|>:",
         {1,     333,   352,   1621,  352,   27232, 76379, 103027, 364,   333,   352,   23845, 352,
          27232, 76379, 98899, 68408, 73159, 67566, 67513, 61056, 99050, 103028, 364,  333,   352,   1621,  352,
-         27232, 67891, 76046, 67551, 68573, 103027, 364, 333, 352, 23845, 352, 27232}}};
+         27232, 67891, 76046, 67551, 68573, 103027, 364, 333, 352, 23845, 352, 27232},
+        true}};
     check_tokenizer(pipeline.tokenizer.get(), cases);
 }