Commit

Fix nan by rescheduling attention scaling (#322)
li-plus authored Jun 24, 2024
1 parent e9989b5 commit f86777c
Showing 9 changed files with 115 additions and 97 deletions.
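The substantive change is in `chatglm.cpp`: the `1.f / sqrt(head_size)` attention scaling is moved off the query tensor and onto the `Q*K^T` scores, while the rest of the diff is largely a rename of `num_kv_heads` to `num_key_value_heads`. The sketch below illustrates only the reordering, in plain C++ with made-up shapes and names rather than ggml tensors. In exact arithmetic the two schedules produce identical scores; per the commit title, the rescheduled version is the one that avoids the NaNs reported in #322 when run through the real reduced-precision graph.

```cpp
// Minimal sketch (plain C++, no ggml) of the reordering made in chatglm.cpp below.
// In exact arithmetic (Q / sqrt(d)) * K^T == (Q * K^T) / sqrt(d); this commit moves the
// scaling after the Q*K^T product in the ggml graph.
#include <cmath>
#include <cstdio>
#include <vector>

// One query row against `kvs` cached key rows, head dimension `d`.
static std::vector<float> scores_scale_before(const std::vector<float> &q, const std::vector<float> &k, int d,
                                              int kvs) {
    std::vector<float> scaled_q(q);
    for (float &x : scaled_q) x /= std::sqrt((float)d); // old schedule: scale Q first
    std::vector<float> scores(kvs, 0.f);
    for (int i = 0; i < kvs; i++)
        for (int j = 0; j < d; j++) scores[i] += scaled_q[j] * k[i * d + j];
    return scores;
}

static std::vector<float> scores_scale_after(const std::vector<float> &q, const std::vector<float> &k, int d,
                                             int kvs) {
    std::vector<float> scores(kvs, 0.f);
    for (int i = 0; i < kvs; i++)
        for (int j = 0; j < d; j++) scores[i] += q[j] * k[i * d + j];
    for (float &s : scores) s /= std::sqrt((float)d); // new schedule: scale the scores
    return scores;
}

int main() {
    const int d = 4, kvs = 3;
    std::vector<float> q = {0.1f, -0.2f, 0.3f, 0.4f};
    std::vector<float> k = {0.5f, 0.1f, -0.3f, 0.2f, 0.7f, -0.1f, 0.0f, 0.4f, -0.2f, 0.6f, 0.1f, 0.3f};
    std::vector<float> a = scores_scale_before(q, k, d, kvs);
    std::vector<float> b = scores_scale_after(q, k, d, kvs);
    for (int i = 0; i < kvs; i++) std::printf("%d: %.6f vs %.6f\n", i, a[i], b[i]);
    return 0;
}
```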
20 changes: 11 additions & 9 deletions README.md
@@ -59,13 +59,15 @@ The original model (`-i <model_name_or_path>`) can be a Hugging Face model name
* CodeGeeX2: `THUDM/codegeex2-6b`, `THUDM/codegeex2-6b-int4`

You are free to try any of the below quantization types by specifying `-t <type>`:
- * `q4_0`: 4-bit integer quantization with fp16 scales.
- * `q4_1`: 4-bit integer quantization with fp16 scales and minimum values.
- * `q5_0`: 5-bit integer quantization with fp16 scales.
- * `q5_1`: 5-bit integer quantization with fp16 scales and minimum values.
- * `q8_0`: 8-bit integer quantization with fp16 scales.
- * `f16`: half precision floating point weights without quantization.
- * `f32`: single precision floating point weights without quantization.
+ | type | precision | symmetric |
+ | ------ | --------- | --------- |
+ | `q4_0` | int4 | true |
+ | `q4_1` | int4 | false |
+ | `q5_0` | int5 | true |
+ | `q5_1` | int5 | false |
+ | `q8_0` | int8 | true |
+ | `f16` | half | |
+ | `f32` | float | |
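The table above replaces the old bullet list and adds a symmetric/asymmetric column: symmetric formats store only an fp16 scale per block, while asymmetric formats store a scale plus a per-block minimum. The sketch below is a rough, self-contained illustration of that distinction only; block size, packing, rounding, and the fp16 storage of the actual ggml `q4_0`/`q4_1`/`q8_0` formats are simplified away.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Symmetric ("q8_0"-like idea): one scale per block, values quantized around zero.
struct SymBlock {
    float scale;
    std::vector<int8_t> q;
};

static SymBlock quantize_symmetric(const std::vector<float> &x) {
    float amax = 0.f;
    for (float v : x) amax = std::max(amax, std::fabs(v));
    SymBlock b;
    b.scale = amax / 127.f; // maps [-amax, amax] onto [-127, 127]
    for (float v : x) b.q.push_back((int8_t)std::lround(b.scale ? v / b.scale : 0.f));
    return b;
}

// Asymmetric ("q4_1"-like idea): scale plus per-block minimum, values quantized into [0, 15].
struct AsymBlock {
    float scale, min;
    std::vector<uint8_t> q;
};

static AsymBlock quantize_asymmetric(const std::vector<float> &x) {
    float lo = *std::min_element(x.begin(), x.end());
    float hi = *std::max_element(x.begin(), x.end());
    AsymBlock b;
    b.min = lo;
    b.scale = (hi - lo) / 15.f;
    for (float v : x) b.q.push_back((uint8_t)std::lround(b.scale ? (v - lo) / b.scale : 0.f));
    return b;
}

int main() {
    std::vector<float> block = {0.12f, -0.40f, 0.33f, 0.05f, -0.27f, 0.50f, -0.08f, 0.21f};
    SymBlock s = quantize_symmetric(block);
    AsymBlock a = quantize_asymmetric(block);
    for (size_t i = 0; i < block.size(); i++) {
        float ds = s.q[i] * s.scale;         // symmetric dequantization
        float da = a.q[i] * a.scale + a.min; // asymmetric dequantization
        std::printf("%7.3f -> sym %7.3f, asym %7.3f\n", block[i], ds, da);
    }
    return 0;
}
```

Symmetric blocks waste quantization range when a block's values are skewed away from zero; the asymmetric variants spend one extra stored value per block to avoid that.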

For LoRA models, add `-l <lora_model_name_or_path>` flag to merge your LoRA weights into the base model. For example, run `python3 chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o models/chatglm3-ggml-lora.bin -l shibing624/chatglm3-6b-csc-chinese-lora` to merge public LoRA weights from Hugging Face.

@@ -551,8 +553,8 @@ Download and unzip the dataset from [link](https://s3.amazonaws.com/research.met
| | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16 |
|-------------------------|-------|-------|-------|-------|-------|-------|
- | [ChatGLM3-6B-Base][1]   | 6.215 | 6.184 | 5.997 | 6.015 | 5.965 | 5.971 |
- | [ChatGLM4-9B-Base][2]   | 6.851 | 6.793 | 6.652 | 6.635 | 6.582 | 6.586 |
+ | [ChatGLM3-6B-Base][1]   | 6.215 | 6.188 | 6.006 | 6.022 | 5.971 | 5.972 |
+ | [ChatGLM4-9B-Base][2]   | 6.834 | 6.780 | 6.645 | 6.624 | 6.576 | 6.577 |
[1]: https://huggingface.co/THUDM/chatglm3-6b-base
[2]: https://huggingface.co/THUDM/glm-4-9b
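(Lower perplexity is better; the updated rows are the values committed alongside this change.) As a reminder of the metric, here is a minimal sketch of perplexity as the exponential of the mean negative log-likelihood per token; the real evaluation script's tokenization, chunking, and stride are omitted, and the probabilities below are invented.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Perplexity = exp( -(1/N) * sum_i log p(token_i | context_i) ).
static double perplexity(const std::vector<double> &token_probs) {
    double nll = 0.0;
    for (double p : token_probs) nll -= std::log(p);
    return std::exp(nll / token_probs.size());
}

int main() {
    // Hypothetical per-token probabilities assigned by a model to the reference text.
    std::vector<double> probs = {0.25, 0.40, 0.10, 0.55, 0.30};
    std::printf("perplexity = %.3f\n", perplexity(probs));
    return 0;
}
```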
25 changes: 13 additions & 12 deletions chatglm.cpp
@@ -624,7 +624,7 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta
const int hidden_size = hidden_states->ne[0];
const int qlen = hidden_states->ne[1];
const int head_size = hidden_size / num_attention_heads;
- const int num_shared_q_heads = num_attention_heads / num_kv_heads;
+ const int num_shared_q_heads = num_attention_heads / num_key_value_heads;

ggml_tensor *qkv = query_key_value.forward(mctx, hidden_states); // [sq, (#h + 2 * #kvh) * d]

@@ -645,10 +645,11 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta
} else {
query_layer = ggml_view_3d(ctx, qkv, head_size, num_attention_heads, qlen, head_size * ggml_element_size(qkv),
qkv->nb[1], 0);
- key_layer = ggml_view_3d(ctx, qkv, head_size, num_kv_heads, qlen, head_size * ggml_element_size(qkv),
+ key_layer = ggml_view_3d(ctx, qkv, head_size, num_key_value_heads, qlen, head_size * ggml_element_size(qkv),
qkv->nb[1], hidden_size * ggml_element_size(qkv));
- value_layer = ggml_view_3d(ctx, qkv, head_size, num_kv_heads, qlen, head_size * ggml_element_size(qkv),
- qkv->nb[1], (hidden_size + head_size * num_kv_heads) * ggml_element_size(qkv));
+ value_layer =
+ ggml_view_3d(ctx, qkv, head_size, num_key_value_heads, qlen, head_size * ggml_element_size(qkv), qkv->nb[1],
+ (hidden_size + head_size * num_key_value_heads) * ggml_element_size(qkv));
}

query_layer = apply_rotary_emb(mctx, query_layer, position_ids, rope_type, rope_theta);
@@ -657,33 +658,33 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta
query_layer = ggml_cont(ctx, ggml_permute(ctx, query_layer, 0, 2, 1, 3)); // [#h, s, d]
if (num_shared_q_heads > 1) {
query_layer = ggml_reshape_3d(ctx, query_layer, head_size, num_shared_q_heads * qlen,
- num_kv_heads); // [#kvh, (#h/#kvh) * s, d]
+ num_key_value_heads); // [#kvh, (#h/#kvh) * s, d]
}

key_layer = ggml_permute(ctx, key_layer, 0, 2, 1, 3); // [#kvh, s, d]
value_layer = ggml_permute(ctx, value_layer, 1, 2, 0, 3); // [#kvh, d, s]

// store key & value to cache
ggml_tensor *k_cache_view =
- ggml_view_3d(ctx, k_cache, head_size, qlen, num_kv_heads, k_cache->nb[1], k_cache->nb[2],
+ ggml_view_3d(ctx, k_cache, head_size, qlen, num_key_value_heads, k_cache->nb[1], k_cache->nb[2],
(num_virtual_tokens + n_past) * head_size * ggml_element_size(k_cache)); // [#kvh, s, d]
ggml_build_forward_expand(mctx->gf, ggml_cpy(ctx, key_layer, k_cache_view));
ggml_tensor *v_cache_view =
- ggml_view_3d(ctx, v_cache, qlen, head_size, num_kv_heads, v_cache->nb[1], v_cache->nb[2],
+ ggml_view_3d(ctx, v_cache, qlen, head_size, num_key_value_heads, v_cache->nb[1], v_cache->nb[2],
(num_virtual_tokens + n_past) * ggml_element_size(v_cache)); // [#kvh, d, s]
ggml_build_forward_expand(mctx->gf, ggml_cpy(ctx, value_layer, v_cache_view));

// concat key & value with past kv
- key_layer = ggml_view_3d(ctx, k_cache, head_size, num_virtual_tokens + n_past + qlen, num_kv_heads, k_cache->nb[1],
- k_cache->nb[2],
+ key_layer = ggml_view_3d(ctx, k_cache, head_size, num_virtual_tokens + n_past + qlen, num_key_value_heads,
+ k_cache->nb[1], k_cache->nb[2],
0); // [#kvh, kvs, d]
- value_layer = ggml_view_3d(ctx, v_cache, num_virtual_tokens + n_past + qlen, head_size, num_kv_heads,
+ value_layer = ggml_view_3d(ctx, v_cache, num_virtual_tokens + n_past + qlen, head_size, num_key_value_heads,
v_cache->nb[1], v_cache->nb[2],
0); // [#kvh, d, kvs]

// attention
- query_layer = ggml_scale_inplace(ctx, query_layer, 1.f / std::sqrt(head_size));
ggml_tensor *attn_scores = ggml_mul_mat(ctx, key_layer, query_layer); // [#kvh, (#h/#kvh) * s, kvs]
+ attn_scores = ggml_scale_inplace(ctx, attn_scores, 1.f / std::sqrt(head_size));

if (n_past == 0) {
// build attention mask for context input
@@ -701,7 +702,7 @@ ggml_tensor *BasicAttention::forward(ModelContext *mctx, ggml_tensor *hidden_sta
if (num_shared_q_heads > 1) {
attn_scores =
ggml_reshape_3d(ctx, attn_scores, num_virtual_tokens + n_past + qlen, num_shared_q_heads * qlen,
- num_kv_heads); // [#kvh, (#h/#kvh) * s, kvs]
+ num_key_value_heads); // [#kvh, (#h/#kvh) * s, kvs]
}
}
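Most of the remaining edits in this file, and in `chatglm.h` below, are the `num_kv_heads` to `num_key_value_heads` rename. That parameter drives grouped-query attention: `num_shared_q_heads = num_attention_heads / num_key_value_heads` query heads share each cached key/value head, which is what the reshape to `[#kvh, (#h/#kvh) * s, d]` above expresses. A standalone sketch of the implied head grouping, indices only, with illustrative names and numbers:

```cpp
#include <cstdio>

// Grouped-query attention head mapping: each KV head serves a contiguous group of
// query heads. When num_attention_heads == num_key_value_heads this reduces to
// standard multi-head attention (one KV head per query head).
int kv_head_for_query_head(int query_head, int num_attention_heads, int num_key_value_heads) {
    int num_shared_q_heads = num_attention_heads / num_key_value_heads;
    return query_head / num_shared_q_heads;
}

int main() {
    const int num_attention_heads = 32, num_key_value_heads = 2; // example GQA configuration
    for (int h = 0; h < num_attention_heads; h += 8)
        std::printf("query head %2d -> kv head %d\n", h,
                    kv_head_for_query_head(h, num_attention_heads, num_key_value_heads));
    return 0;
}
```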

99 changes: 60 additions & 39 deletions chatglm.h
@@ -65,7 +65,7 @@ struct ConfigRecordV1 {

// For compatibility
struct ConfigRecordV1GQA : public ConfigRecordV1 {
- int num_kv_heads;
+ int num_key_value_heads;
};

// TODO: use json to serialize config
@@ -109,15 +109,15 @@ class ModelConfig {
ModelConfig() = default;

ModelConfig(ModelType model_type, ggml_type dtype, int vocab_size, int hidden_size, int num_attention_heads,
- int num_kv_heads, int num_hidden_layers, int intermediate_size, float norm_eps, float rope_theta,
+ int num_key_value_heads, int num_hidden_layers, int intermediate_size, float norm_eps, float rope_theta,
int num_virtual_tokens, int max_length, int bos_token_id, int eos_token_id, int pad_token_id,
int sep_token_id, std::vector<int> extra_eos_token_ids)
: model_type(model_type), dtype(dtype), vocab_size(vocab_size), hidden_size(hidden_size),
- num_attention_heads(num_attention_heads), num_kv_heads(num_kv_heads), num_hidden_layers(num_hidden_layers),
- intermediate_size(intermediate_size), norm_eps(norm_eps), rope_theta(rope_theta),
- num_virtual_tokens(num_virtual_tokens), max_length(max_length), bos_token_id(bos_token_id),
- eos_token_id(eos_token_id), pad_token_id(pad_token_id), sep_token_id(sep_token_id),
- extra_eos_token_ids(std::move(extra_eos_token_ids)) {
+ num_attention_heads(num_attention_heads), num_key_value_heads(num_key_value_heads),
+ num_hidden_layers(num_hidden_layers), intermediate_size(intermediate_size), norm_eps(norm_eps),
+ rope_theta(rope_theta), num_virtual_tokens(num_virtual_tokens), max_length(max_length),
+ bos_token_id(bos_token_id), eos_token_id(eos_token_id), pad_token_id(pad_token_id),
+ sep_token_id(sep_token_id), extra_eos_token_ids(std::move(extra_eos_token_ids)) {
if (model_type == ModelType::CHATGLM) {
hidden_act = ActivationType::GELU;
use_qkv_bias = true;
@@ -146,9 +146,10 @@ class ModelConfig {

ModelConfig(ModelType model_type, const ConfigRecordV1GQA &rec, float norm_eps, float rope_theta,
int num_virtual_tokens)
- : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads, rec.num_kv_heads,
- rec.num_hidden_layers, rec.intermediate_size, norm_eps, rope_theta, num_virtual_tokens,
- rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id, rec.sep_token_id, {}) {}
+ : ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads,
+ rec.num_key_value_heads, rec.num_hidden_layers, rec.intermediate_size, norm_eps, rope_theta,
+ num_virtual_tokens, rec.max_length, rec.bos_token_id, rec.eos_token_id, rec.pad_token_id,
+ rec.sep_token_id, {}) {}

ModelConfig(ModelType model_type, const ConfigRecordV2 &rec)
: ModelConfig(model_type, rec.dtype, rec.vocab_size, rec.hidden_size, rec.num_attention_heads,
@@ -158,13 +159,33 @@

std::string model_type_name() const { return to_string(model_type); }

+ friend std::ostream &operator<<(std::ostream &os, const ModelConfig &self) {
+ os << "ModelConfig(model_type=" << (int)self.model_type << ", dtype=" << self.dtype
+ << ", vocab_size=" << self.vocab_size << ", hidden_size=" << self.hidden_size
+ << ", num_attention_heads=" << self.num_attention_heads
+ << ", num_key_value_heads=" << self.num_key_value_heads << ", num_hidden_layers=" << self.num_hidden_layers
+ << ", intermediate_size=" << self.intermediate_size << ", norm_eps=" << self.norm_eps
+ << ", hidden_act=" << (int)self.hidden_act << ", use_qkv_bias=" << self.use_qkv_bias
+ << ", use_dense_bias=" << self.use_dense_bias << ", interleaved_qkv=" << self.interleaved_qkv
+ << ", tie_word_embeddings=" << self.tie_word_embeddings << ", rope_type=" << (int)self.rope_type
+ << ", rope_theta=" << self.rope_theta << ", attn_mask_type=" << (int)self.attn_mask_type
+ << ", num_virtual_tokens=" << self.num_virtual_tokens << ", max_length=" << self.max_length
+ << ", bos_token_id=" << self.bos_token_id << ", eos_token_id=" << self.eos_token_id
+ << ", pad_token_id=" << self.pad_token_id << ", sep_token_id=" << self.sep_token_id
+ << ", extra_eos_token_ids={";
+ for (size_t i = 0; i < self.extra_eos_token_ids.size(); i++) {
+ os << (i > 0 ? ", " : "") << self.extra_eos_token_ids[i];
+ }
+ return os << "})";
+ }

public:
ModelType model_type;
ggml_type dtype;
int vocab_size;
int hidden_size;
int num_attention_heads;
- int num_kv_heads;
+ int num_key_value_heads;
int num_hidden_layers;
int intermediate_size;
float norm_eps;
@@ -419,26 +440,26 @@ class BasicGLU {
class BasicAttention {
public:
BasicAttention() = default;
- BasicAttention(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length,
- bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, RopeType rope_type, float rope_theta,
- AttentionMaskType attn_mask_type, int num_virtual_tokens)
- : num_attention_heads(num_attention_heads), num_kv_heads(num_kv_heads), interleaved_qkv(interleaved_qkv),
- rope_type(rope_type), rope_theta(rope_theta), attn_mask_type(attn_mask_type),
- num_virtual_tokens(num_virtual_tokens),
- query_key_value(mctx, hidden_size, hidden_size + 2 * (hidden_size / num_attention_heads) * num_kv_heads,
- use_qkv_bias),
+ BasicAttention(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_key_value_heads,
+ int max_length, bool use_qkv_bias, bool use_dense_bias, bool interleaved_qkv, RopeType rope_type,
+ float rope_theta, AttentionMaskType attn_mask_type, int num_virtual_tokens)
+ : num_attention_heads(num_attention_heads), num_key_value_heads(num_key_value_heads),
+ interleaved_qkv(interleaved_qkv), rope_type(rope_type), rope_theta(rope_theta),
+ attn_mask_type(attn_mask_type), num_virtual_tokens(num_virtual_tokens),
+ query_key_value(mctx, hidden_size,
+ hidden_size + 2 * (hidden_size / num_attention_heads) * num_key_value_heads, use_qkv_bias),
dense(mctx, hidden_size, hidden_size, use_dense_bias),
k_cache(ggml_new_tensor_3d(mctx->ctx_kv.get(), GGML_TYPE_F16, hidden_size / num_attention_heads,
- max_length + num_virtual_tokens, num_kv_heads)),
+ max_length + num_virtual_tokens, num_key_value_heads)),
v_cache(ggml_new_tensor_3d(mctx->ctx_kv.get(), GGML_TYPE_F16, max_length + num_virtual_tokens,
- hidden_size / num_attention_heads, num_kv_heads)) {}
+ hidden_size / num_attention_heads, num_key_value_heads)) {}

ggml_tensor *forward(ModelContext *mctx, ggml_tensor *hidden_states, ggml_tensor *attention_mask,
ggml_tensor *position_ids, int n_past) const;

public:
int num_attention_heads;
- int num_kv_heads;
+ int num_key_value_heads;
bool interleaved_qkv;
RopeType rope_type;
float rope_theta;
@@ -454,13 +475,13 @@ template <typename Norm, typename MLP>
class BasicBlock {
public:
BasicBlock() = default;
- BasicBlock(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_kv_heads, int intermediate_size,
- int max_length, float norm_eps, ActivationType hidden_act, bool use_qkv_bias, bool use_dense_bias,
- bool interleaved_qkv, RopeType rope_type, float rope_theta, AttentionMaskType attn_mask_type,
- int num_virtual_tokens)
+ BasicBlock(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_key_value_heads,
+ int intermediate_size, int max_length, float norm_eps, ActivationType hidden_act, bool use_qkv_bias,
+ bool use_dense_bias, bool interleaved_qkv, RopeType rope_type, float rope_theta,
+ AttentionMaskType attn_mask_type, int num_virtual_tokens)
: input_layernorm(mctx, hidden_size, false, norm_eps),
- attention(mctx, hidden_size, num_attention_heads, num_kv_heads, max_length, use_qkv_bias, use_dense_bias,
- interleaved_qkv, rope_type, rope_theta, attn_mask_type, num_virtual_tokens),
+ attention(mctx, hidden_size, num_attention_heads, num_key_value_heads, max_length, use_qkv_bias,
+ use_dense_bias, interleaved_qkv, rope_type, rope_theta, attn_mask_type, num_virtual_tokens),
post_attention_layernorm(mctx, hidden_size, false, norm_eps),
mlp(mctx, hidden_size, intermediate_size, hidden_act) {}

@@ -572,20 +593,20 @@ class BasicModel {
auto &attn = layers[i].attention;
ggml_tensor *virtual_key =
ggml_view_3d(mctx.ctx_b.get(), past_key_values, head_size, config.num_virtual_tokens,
- config.num_kv_heads, past_key_values->nb[1], past_key_values->nb[2],
+ config.num_key_value_heads, past_key_values->nb[1], past_key_values->nb[2],
i * 2 * past_key_values->nb[3]); // [#h, v, d]
ggml_tensor *k_cache_view =
- ggml_view_3d(mctx.ctx_b.get(), attn.k_cache, head_size, config.num_virtual_tokens, config.num_kv_heads,
- attn.k_cache->nb[1], attn.k_cache->nb[2], 0); // [#h, v, d]
+ ggml_view_3d(mctx.ctx_b.get(), attn.k_cache, head_size, config.num_virtual_tokens,
+ config.num_key_value_heads, attn.k_cache->nb[1], attn.k_cache->nb[2], 0); // [#h, v, d]
ggml_build_forward_expand(mctx.gf, ggml_cpy(mctx.ctx_b.get(), virtual_key, k_cache_view));

ggml_tensor *virtual_value = ggml_view_3d(
- mctx.ctx_b.get(), past_key_values, head_size, config.num_virtual_tokens, config.num_kv_heads,
+ mctx.ctx_b.get(), past_key_values, head_size, config.num_virtual_tokens, config.num_key_value_heads,
past_key_values->nb[1], past_key_values->nb[2], (i * 2 + 1) * past_key_values->nb[3]); // [#h, v, d]
virtual_value = ggml_permute(mctx.ctx_b.get(), virtual_value, 1, 0, 2, 3); // [#h, d, v]
ggml_tensor *v_cache_view =
- ggml_view_3d(mctx.ctx_b.get(), attn.v_cache, config.num_virtual_tokens, head_size, config.num_kv_heads,
- attn.v_cache->nb[1], attn.v_cache->nb[2], 0); // [#h, d, v]
+ ggml_view_3d(mctx.ctx_b.get(), attn.v_cache, config.num_virtual_tokens, head_size,
+ config.num_key_value_heads, attn.v_cache->nb[1], attn.v_cache->nb[2], 0); // [#h, d, v]
ggml_build_forward_expand(mctx.gf, ggml_cpy(mctx.ctx_b.get(), virtual_value, v_cache_view));
}

@@ -598,7 +619,7 @@ class BasicModel {
std::vector<Block> layers;
layers.reserve(config.num_hidden_layers);
for (int layer_id = 0; layer_id < config.num_hidden_layers; layer_id++) {
- layers.emplace_back(mctx, config.hidden_size, config.num_attention_heads, config.num_kv_heads,
+ layers.emplace_back(mctx, config.hidden_size, config.num_attention_heads, config.num_key_value_heads,
config.intermediate_size, config.max_length, config.norm_eps, config.hidden_act,
config.use_qkv_bias, config.use_dense_bias, config.interleaved_qkv, config.rope_type,
config.rope_theta, config.attn_mask_type, config.num_virtual_tokens);
@@ -858,10 +879,10 @@ class ChatGLMTokenizer : public BaseTokenizer {
class GLMBlock : public BasicBlock<LayerNorm, BasicMLP> {
public:
GLMBlock() = default;
- GLMBlock(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_kv_heads, int intermediate_size,
- int max_length, float norm_eps, ActivationType hidden_act, bool use_qkv_bias, bool use_dense_bias,
- bool interleaved_qkv, RopeType rope_type, float rope_theta, AttentionMaskType attn_mask_type,
- int num_virtual_tokens)
+ GLMBlock(ModelContext *mctx, int hidden_size, int num_attention_heads, int num_key_value_heads,
+ int intermediate_size, int max_length, float norm_eps, ActivationType hidden_act, bool use_qkv_bias,
+ bool use_dense_bias, bool interleaved_qkv, RopeType rope_type, float rope_theta,
+ AttentionMaskType attn_mask_type, int num_virtual_tokens)
: BasicBlock(LayerNorm(mctx, hidden_size, false, norm_eps),
BasicAttention(mctx, hidden_size, num_attention_heads, num_attention_heads, max_length,
use_qkv_bias, use_dense_bias, interleaved_qkv, rope_type, rope_theta,
2 changes: 1 addition & 1 deletion chatglm_cpp/_C.pyi
@@ -104,7 +104,7 @@ class ModelConfig:
def num_hidden_layers(self) -> int:
...
@property
- def num_kv_heads(self) -> int:
+ def num_key_value_heads(self) -> int:
...
@property
def pad_token_id(self) -> int:
(5 more changed files not shown)
