diff --git a/.gitignore b/.gitignore
index 419d53da..1a428ef6 100755
--- a/.gitignore
+++ b/.gitignore
@@ -166,4 +166,4 @@ checkpoints/
 *.txt
 pipeline/serve/deploy/otterhd_endpoint.py
 pipeline/benchmarks/models/llava_model.py
-eval_results/
+# eval_results/
diff --git a/eval_results/eval_results_llava b/eval_results/eval_results_llava
new file mode 100644
index 00000000..fd481734
--- /dev/null
+++ b/eval_results/eval_results_llava
@@ -0,0 +1,36 @@
+================================================================================
+ EVALUATION REPORT
+================================================================================
+
+
+MODEL INFO: {'name': 'llava_model', 'model_path': '/mnt/petrelfs/zhangyuanhan/LLaVA/checkpoints/llava-v1.5-7b'}
+--------------------------------------------------------------------------------
+[2023-12-21 10:25:49,407] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+Imported class:
+Imported class:
+
+DATASET: MMEDataset
+--------------------
+=========== Cognition ===========
+total score: 250.0
+ code_reasoning score: 55.0
+ numerical_calculation score: 35.0
+ text_translation score: 50.0
+ commonsense_reasoning score: 110.0
+=========== Perception ===========
+total score: 1484.87775110044
+ artwork score: 125.75
+ celebrity score: 129.41176470588235
+ count score: 153.33333333333334
+ color score: 165.0
+ position score: 118.33333333333334
+ OCR score: 132.5
+ landmark score: 160.0
+ scene score: 157.25
+ existence score: 195.0
+ posters score: 148.29931972789115
+
+--------------------------------------------------------------------------------
+Total Datasets Evaluated: 1
+
+================================================================================
diff --git a/eval_results/eval_results_otter b/eval_results/eval_results_otter
new file mode 100644
index 00000000..92e93e15
--- /dev/null
+++ b/eval_results/eval_results_otter
@@ -0,0 +1,197 @@
+================================================================================
+ EVALUATION REPORT
+================================================================================
+
+
+MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_sft_nonconv_nogroup/epoch_1/'}
+--------------------------------------------------------------------------------
+[2023-12-23 08:32:08,024] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+Imported class:
+The current model version is configured for Otter-Image with max_num_frames set to None.
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 295.3571428571429 + code_reasoning score: 50.0 + numerical_calculation score: 80.0 + text_translation score: 72.5 + commonsense_reasoning score: 92.85714285714286 +=========== Perception =========== +total score: 902.483993597439 + artwork score: 58.0 + celebrity score: 67.05882352941177 + count score: 121.66666666666666 + color score: 55.00000000000001 + position score: 50.0 + OCR score: 50.0 + landmark score: 119.25 + scene score: 155.25 + existence score: 163.33333333333334 + posters score: 62.925170068027214 + +-------------------------------------------------------------------------------- +Total 
Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_2e-3 b/eval_results/eval_results_otter_2e-3 new file mode 100644 index 00000000..6e4de9ad --- /dev/null +++ b/eval_results/eval_results_otter_2e-3 @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/ootter_llava_sft_2e-3/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-25 06:47:14,228] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 
0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: 
lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: 
perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: 
perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 150.35714285714286 + code_reasoning score: 50.0 + numerical_calculation score: 40.0 + text_translation score: 27.500000000000004 + commonsense_reasoning score: 32.857142857142854 +=========== Perception =========== +total score: 418.11954781912766 + artwork score: 36.0 + celebrity score: 48.529411764705884 + count score: 41.66666666666667 + color score: 50.0 + position score: 50.0 + OCR score: 20.0 + landmark score: 46.0 + scene score: 42.25 + existence score: 50.0 + posters score: 33.6734693877551 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_2e-4 b/eval_results/eval_results_otter_2e-4 new file mode 100644 index 00000000..931902c1 --- /dev/null +++ b/eval_results/eval_results_otter_2e-4 @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_direct_sft_2e-4/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-24 06:29:03,665] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 310.3571428571429 + code_reasoning score: 65.0 + numerical_calculation score: 65.0 + text_translation score: 82.5 + commonsense_reasoning score: 97.85714285714286 +=========== Perception =========== +total score: 946.0785314125651 + artwork score: 76.25 + celebrity score: 79.11764705882354 + count score: 93.33333333333333 + color score: 60.0 + position score: 48.333333333333336 + OCR score: 62.5 + landmark score: 118.25 + scene score: 157.75 + existence score: 173.33333333333334 + posters score: 77.2108843537415 + +-------------------------------------------------------------------------------- +Total 
Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_direct_sft_nopretrain b/eval_results/eval_results_otter_direct_sft_nopretrain new file mode 100644 index 00000000..02b8eb48 --- /dev/null +++ b/eval_results/eval_results_otter_direct_sft_nopretrain @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_direct_sft_nopretrain/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-25 06:52:48,839] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: 
lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 
1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 
4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M 
+Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 274.6428571428571 + code_reasoning score: 62.5 + numerical_calculation score: 47.5 + text_translation score: 77.5 + commonsense_reasoning score: 87.14285714285714 +=========== Perception =========== +total score: 909.7284913965585 + artwork score: 63.25 + celebrity score: 63.23529411764705 + count score: 101.66666666666667 + color score: 65.0 + position score: 56.666666666666664 + OCR score: 72.5 + landmark score: 103.25 + scene score: 149.5 + existence score: 185.0 + posters score: 49.65986394557823 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform new file mode 100644 index 00000000..581bb4b6 --- /dev/null +++ b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image_llava', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_direct_sft_nopretrain_llava_transform/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-26 08:58:52,667] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
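The per-parameter dump that each of these reports opens with (including the one that follows for otter_llava_direct_sft_nopretrain_llava_transform) reduces to a loop over the model's trainable parameters, printing each size in millions of elements and the total in billions. A minimal sketch, assuming an already-loaded `torch.nn.Module` for the checkpoint; this is not necessarily the exact logging code used by the benchmark runner:

```python
# Sketch only: reproduce a "Parameter: <name>, Size: <x> M" dump and the
# "Total Trainable param: ... B" line for an arbitrary PyTorch module.
# `model` stands in for the loaded Otter checkpoint; assumption, not repo code.
import torch.nn as nn

def dump_trainable_params(model: nn.Module) -> None:
    total = 0
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # reports above list only the trainable (unfrozen) weights
        total += param.numel()
        print(f"Parameter: {name}, Size: {param.numel() / 1e6:.6f} M")
    print(f"Total Trainable param: {total / 1e9:.6f} B")
```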
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 316.42857142857144 + code_reasoning score: 92.5 + numerical_calculation score: 50.0 + text_translation score: 77.5 + commonsense_reasoning score: 96.42857142857143 +=========== Perception =========== +total score: 895.4419767907164 + artwork score: 84.0 + celebrity score: 80.58823529411765 + count score: 63.333333333333336 + color score: 68.33333333333334 + position score: 71.66666666666667 + OCR score: 102.5 + landmark score: 76.25 + scene score: 147.75 + existence score: 150.0 + posters score: 51.0204081632653 + +-------------------------------------------------------------------------------- +Total 
Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_1epoch b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_1epoch new file mode 100644 index 00000000..c2fe2406 --- /dev/null +++ b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_1epoch @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image_llava', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_direct_sft_nopretrain_llava_transform_1epoch/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-27 08:26:27,499] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: 
perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: 
perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 231.07142857142856 + code_reasoning score: 50.0 + numerical_calculation score: 47.5 + text_translation score: 50.0 + commonsense_reasoning score: 83.57142857142857 +=========== Perception =========== +total score: 831.0618247298919 + artwork score: 61.75 + celebrity score: 63.23529411764706 + count score: 86.66666666666666 + color score: 70.0 + position score: 60.0 + OCR score: 87.5 + landmark score: 75.5 + scene score: 131.75 + existence score: 145.0 + posters score: 49.65986394557824 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_lr2e-4 b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_lr2e-4 new file mode 100644 index 00000000..a9bb2a82 --- /dev/null +++ b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_lr2e-4 @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image_llava', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_direct_sft_nopretrain_llava_transform_lr2e-4/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-26 08:59:24,855] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
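Within each MMEDataset block, the Cognition and Perception "total score" lines are simply the sums of the subcategory scores listed beneath them. A quick standalone check against the otter_llava_direct_sft_nopretrain_llava_transform_1epoch report above (not part of the evaluation pipeline; the Perception total works the same way over its ten subcategories):

```python
# Sanity check: Cognition total = sum of its four subcategory scores.
# Values copied from the ..._1epoch MMEDataset report above.
cognition_subscores = {
    "code_reasoning": 50.0,
    "numerical_calculation": 47.5,
    "text_translation": 50.0,
    "commonsense_reasoning": 83.57142857142857,
}
total = sum(cognition_subscores.values())
print(total)  # ~231.0714285714286, matching the reported "total score"
assert abs(total - 231.07142857142856) < 1e-6
```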
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 213.57142857142856 + code_reasoning score: 50.0 + numerical_calculation score: 50.0 + text_translation score: 50.0 + commonsense_reasoning score: 63.57142857142857 +=========== Perception =========== +total score: 509.50390156062423 + artwork score: 55.24999999999999 + celebrity score: 43.52941176470588 + count score: 50.0 + color score: 50.0 + position score: 50.0 + OCR score: 50.0 + landmark score: 49.75 + scene score: 49.75 + existence score: 50.0 + posters score: 61.224489795918366 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + 
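With several of these reports checked in, a small helper makes the checkpoints easier to compare side by side. This is a hypothetical convenience script, not repository code: it assumes the eval_results_* files keep the layout shown above and live in an eval_results/ directory.

```python
# Hypothetical helper: pull the Cognition / Perception "total score" lines out
# of each eval_results_* report so checkpoints can be compared at a glance.
import re
from pathlib import Path

def summarize(report_dir: str = "eval_results") -> None:
    for path in sorted(Path(report_dir).glob("eval_results_*")):
        text = path.read_text()
        # Each "=========== Cognition/Perception ===========" header is
        # followed by a "total score: <float>" line in these reports.
        totals = {
            section: float(score)
            for section, score in re.findall(
                r"=+ (Cognition|Perception) =+\s*total score: ([0-9.]+)", text
            )
        }
        print(path.name, totals)

if __name__ == "__main__":
    summarize()
```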
+================================================================================ diff --git a/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_lr2e-4_1epoch b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_lr2e-4_1epoch new file mode 100644 index 00000000..daaff8c9 --- /dev/null +++ b/eval_results/eval_results_otter_direct_sft_nopretrain_llava_transform_lr2e-4_1epoch @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image_llava', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_direct_sft_nopretrain_llava_transform_lr2e-4_1epoch/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-27 08:18:19,937] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: 
perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: 
perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 212.14285714285714 + code_reasoning score: 50.0 + numerical_calculation score: 50.0 + text_translation score: 50.0 + commonsense_reasoning score: 62.14285714285714 +=========== Perception =========== +total score: 495.21188475390153 + artwork score: 41.75 + celebrity score: 47.05882352941176 + count score: 58.333333333333336 + color score: 50.0 + position score: 50.0 + OCR score: 50.0 + landmark score: 50.74999999999999 + scene score: 48.0 + existence score: 50.0 + posters score: 49.31972789115646 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_lora b/eval_results/eval_results_otter_lora new file mode 100644 index 00000000..4b540843 --- /dev/null +++ b/eval_results/eval_results_otter_lora @@ -0,0 +1,328 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_sft_nonconv_nogroup_lora/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-21 22:41:01,034] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
+Using LoRA with config:{'lora_alpha': 256, 'lora_dropout': 0.05, 'r': 128} +trainable params: 67,108,864 || all params: 7,921,365,008 || trainable%: 0.8471881289679866 +LoRA trainable param: 67.109 M +Parameter: lang_encoder.base_model.model.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.base_model.model.model.layers.0.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.0.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.0.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.0.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.1.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.1.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.1.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.1.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.2.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.2.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.2.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.2.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.3.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.3.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.3.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: 
lang_encoder.base_model.model.model.layers.3.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.4.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.4.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.4.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.4.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.5.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.5.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.5.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.5.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.6.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.6.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.6.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.6.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.7.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.7.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.7.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.7.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.8.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 
0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.8.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.8.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.8.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.9.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.9.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.9.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.9.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.10.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.10.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.10.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.10.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.11.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.11.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.11.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.11.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.12.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.12.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: 
lang_encoder.base_model.model.model.layers.12.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.12.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.13.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.13.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.13.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.13.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.14.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.14.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.14.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.14.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.15.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.15.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.15.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.15.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.16.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.16.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.16.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: 
lang_encoder.base_model.model.model.layers.16.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.17.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.17.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.17.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.17.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.18.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.18.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.18.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.18.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.19.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.19.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.19.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.19.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.20.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.20.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.20.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.20.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: 
lang_encoder.base_model.model.model.layers.21.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.21.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.21.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.21.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.22.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.22.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.22.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.22.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.23.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.23.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.23.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.23.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.24.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.24.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.24.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.24.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.25.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: 
lang_encoder.base_model.model.model.layers.25.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.25.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.25.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.26.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.26.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.26.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.26.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.27.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.27.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.27.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.27.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.28.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.28.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.28.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.28.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.29.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.29.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: 
lang_encoder.base_model.model.model.layers.29.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.29.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.30.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.30.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.30.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.30.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.base_model.model.model.layers.31.decoder_layer.self_attn.q_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.31.decoder_layer.self_attn.q_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.31.decoder_layer.self_attn.v_proj.lora_A.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.model.layers.31.decoder_layer.self_attn.v_proj.lora_B.default.weight, Size: 0.524288 M +Parameter: lang_encoder.base_model.model.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M 
+Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M 
+Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.508112 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 218.57142857142856 + code_reasoning score: 50.0 + numerical_calculation score: 50.0 + text_translation score: 50.0 + commonsense_reasoning score: 68.57142857142857 +=========== Perception =========== +total score: 769.2145858343337 + artwork score: 53.24999999999999 + celebrity score: 54.11764705882353 + count score: 63.333333333333336 + color score: 50.0 + position score: 48.333333333333336 + OCR score: 50.0 + landmark score: 106.25 + scene score: 133.25 + existence score: 160.0 + posters score: 50.68027210884354 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_lr1e-3_3epochs b/eval_results/eval_results_otter_lr1e-3_3epochs new file mode 100644 index 00000000..41cd5860 --- /dev/null +++ b/eval_results/eval_results_otter_lr1e-3_3epochs @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_lr1e-3_3epochs_sft_nonconv_nogroup/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-23 08:13:17,507] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 221.42857142857144 + code_reasoning score: 50.0 + numerical_calculation score: 50.0 + text_translation score: 50.0 + commonsense_reasoning score: 71.42857142857143 +=========== Perception =========== +total score: 716.3894557823129 + artwork score: 65.5 + celebrity score: 55.0 + count score: 86.66666666666666 + color score: 65.0 + position score: 50.0 + OCR score: 50.0 + landmark score: 75.5 + scene score: 96.75 + existence score: 123.33333333333333 + posters score: 48.63945578231292 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + 
+================================================================================ diff --git a/eval_results/eval_results_otter_new b/eval_results/eval_results_otter_new new file mode 100644 index 00000000..0d345515 --- /dev/null +++ b/eval_results/eval_results_otter_new @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_sft_nonconv_nogroup/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-25 21:19:56,950] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: 
lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, 
Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: 
perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: 
perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 293.57142857142856 + code_reasoning score: 70.0 + numerical_calculation score: 55.0 + text_translation score: 75.0 + commonsense_reasoning score: 93.57142857142856 +=========== Perception =========== +total score: 924.3667466986794 + artwork score: 70.5 + celebrity score: 86.76470588235294 + count score: 110.0 + color score: 60.0 + position score: 46.666666666666664 + OCR score: 70.0 + landmark score: 106.25 + scene score: 150.75 + existence score: 168.33333333333331 + posters score: 55.10204081632653 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_pretrain_lr1e-3_resampler256_only_input b/eval_results/eval_results_otter_pretrain_lr1e-3_resampler256_only_input new file mode 100644 index 00000000..c017f47d --- /dev/null +++ b/eval_results/eval_results_otter_pretrain_lr1e-3_resampler256_only_input @@ -0,0 +1,104 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_lr1e-3_sft_nonconv_nogroup_resampler256_only_input/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-25 19:53:16,490] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.0.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 2.097152 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.to_out.weight, Size: 2.097152 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 2.097152 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.to_out.weight, Size: 2.097152 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 2.097152 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.to_out.weight, Size: 2.097152 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: 
perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 2.097152 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.to_out.weight, Size: 2.097152 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 2.097152 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.to_out.weight, Size: 2.097152 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 2.097152 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.to_out.weight, Size: 2.097152 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 0.502413 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 0.0 + code_reasoning score: 0.0 + numerical_calculation score: 0.0 + text_translation score: 0.0 + commonsense_reasoning score: 0.0 +=========== Perception =========== diff --git a/eval_results/eval_results_otter_pretrainlr1e-3 b/eval_results/eval_results_otter_pretrainlr1e-3 new file mode 100644 index 00000000..cea78bcb --- /dev/null +++ b/eval_results/eval_results_otter_pretrainlr1e-3 @@ -0,0 +1,197 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_lr1e-3_sft_nonconv_nogroup/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-23 22:05:42,500] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. 
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 
2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 1.441004 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 266.42857142857144 + code_reasoning score: 50.0 + numerical_calculation score: 50.0 + text_translation score: 100.0 + commonsense_reasoning score: 66.42857142857143 +=========== Perception =========== +total score: 529.3248299319728 + artwork score: 70.5 + celebrity score: 50.0 + count score: 50.0 + color score: 50.0 + position score: 50.0 + OCR score: 50.0 + landmark score: 54.24999999999999 + scene score: 62.5 + existence score: 55.00000000000001 + posters score: 37.07482993197279 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + 
+================================================================================ diff --git a/eval_results/eval_results_otter_train_lang_encoder b/eval_results/eval_results_otter_train_lang_encoder new file mode 100644 index 00000000..a194201f --- /dev/null +++ b/eval_results/eval_results_otter_train_lang_encoder @@ -0,0 +1,487 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_sft_nonconv_nogroup_train_lang_encoder/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-23 13:32:41,224] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Unfreeze language decoder. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.0.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.0.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.0.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.0.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.1.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.1.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.1.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.1.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.2.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.2.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: 
lang_encoder.model.layers.2.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.2.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.3.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.3.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.3.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.4.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.4.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.4.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.4.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: 
lang_encoder.model.layers.5.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.5.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.5.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.5.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.6.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.6.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.6.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.6.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.7.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.7.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.7.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: 
lang_encoder.model.layers.8.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.8.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.8.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.8.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.8.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.9.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.9.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.9.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.9.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.10.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.10.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.10.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.10.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: 
lang_encoder.model.layers.11.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.11.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.11.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.11.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.12.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.12.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.12.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.12.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.13.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.13.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.13.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.13.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.14.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.14.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.14.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.14.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.15.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.15.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.15.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.16.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.16.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.16.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.16.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.17.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.17.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.17.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: 
lang_encoder.model.layers.17.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.18.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.18.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.18.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.18.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.19.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.19.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.19.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.20.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: 
lang_encoder.model.layers.20.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.20.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.20.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.21.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.21.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.21.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.21.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.22.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.22.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.22.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.22.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: 
lang_encoder.model.layers.23.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.23.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.23.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.23.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.24.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.24.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.24.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.24.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.25.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.25.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.25.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.25.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.26.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.26.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.26.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.26.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: 
lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.27.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.27.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.27.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.28.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.28.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.28.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.28.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.29.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.29.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.29.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.29.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: 
lang_encoder.model.layers.30.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.30.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.30.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.30.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.30.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.31.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.31.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.31.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: 
perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: 
perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 7.917275 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 304.6428571428571 + code_reasoning score: 65.0 + numerical_calculation score: 50.0 + text_translation score: 97.5 + commonsense_reasoning score: 92.14285714285714 +=========== Perception =========== +total score: 995.9196678671468 + artwork score: 67.0 + celebrity score: 76.17647058823529 + count score: 96.66666666666666 + color score: 70.0 + position score: 56.666666666666664 + OCR score: 102.5 + landmark score: 113.75 + scene score: 138.5 + existence score: 175.0 + posters score: 99.65986394557824 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_otter_train_lang_encoder_new b/eval_results/eval_results_otter_train_lang_encoder_new new file mode 100644 index 00000000..6d6b8813 --- /dev/null +++ b/eval_results/eval_results_otter_train_lang_encoder_new @@ -0,0 +1,487 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_sft_nonconv_nogroup_train_lang_encoder/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-25 21:27:09,679] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Unfreeze language decoder. 
+Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.0.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.0.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.0.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.0.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.0.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.1.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.1.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.1.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.1.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.1.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.2.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.2.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.2.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.2.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.2.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M 
+Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.3.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.3.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.3.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.3.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.4.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.4.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.4.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.4.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.4.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.5.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.5.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.5.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.5.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.5.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.6.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.6.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.6.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.6.decoder_layer.input_layernorm.weight, 
Size: 0.004096 M +Parameter: lang_encoder.model.layers.6.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.7.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.7.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.7.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.7.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.8.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.8.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.8.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.8.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.8.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.9.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.9.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: 
lang_encoder.model.layers.9.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.9.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.9.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.10.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.10.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.10.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.10.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.10.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.11.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.11.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.11.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.11.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.12.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: 
lang_encoder.model.layers.12.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.12.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.12.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.12.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.12.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.13.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.13.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.13.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.13.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.13.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.14.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.14.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.14.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.14.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.14.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: 
lang_encoder.model.layers.15.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.15.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.15.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.15.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.15.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.16.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.16.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.16.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.16.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.16.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.17.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.17.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.17.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.17.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.17.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.18.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.18.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.18.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.18.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.18.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: 
lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.19.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.19.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.19.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.19.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.20.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.20.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.20.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.20.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.20.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.21.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.21.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.21.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.21.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.21.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: 
lang_encoder.model.layers.22.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.22.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.22.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.22.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.22.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.22.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.23.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.23.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.23.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.23.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.24.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.24.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.24.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: 
lang_encoder.model.layers.24.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.24.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.25.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.25.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.25.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.25.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.25.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.26.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.26.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.26.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.26.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.26.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.27.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: 
lang_encoder.model.layers.27.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.27.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.27.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.28.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.28.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.28.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.28.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.28.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.29.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.29.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.29.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.29.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.29.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.30.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.30.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.30.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.30.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.30.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: 
lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.q_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.k_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.v_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.self_attn.o_proj.weight, Size: 16.777216 M +Parameter: lang_encoder.model.layers.31.decoder_layer.mlp.gate_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.31.decoder_layer.mlp.up_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.31.decoder_layer.mlp.down_proj.weight, Size: 45.088768 M +Parameter: lang_encoder.model.layers.31.decoder_layer.input_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.decoder_layer.post_attention_layernorm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: 
perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.norm.weight, Size: 0.001024 M +Parameter: perceiver.norm.bias, Size: 0.001024 M +Total Trainable param: 7.917275 B +Imported class: + +DATASET: MMEDataset +-------------------- +=========== Cognition =========== +total score: 292.8571428571429 + code_reasoning score: 57.5 + numerical_calculation score: 50.0 + text_translation score: 92.5 + commonsense_reasoning score: 92.85714285714286 +=========== Perception =========== +total score: 962.7440976390556 + artwork score: 61.25 + celebrity score: 72.3529411764706 + count score: 88.33333333333334 + color score: 71.66666666666667 + position score: 56.666666666666664 + OCR score: 102.5 + landmark score: 119.75 + scene score: 154.0 + existence score: 175.0 + posters score: 61.224489795918366 + +-------------------------------------------------------------------------------- +Total Datasets Evaluated: 1 + +================================================================================ diff --git a/eval_results/eval_results_x 
b/eval_results/eval_results_x new file mode 100644 index 00000000..bacedf33 --- /dev/null +++ b/eval_results/eval_results_x @@ -0,0 +1,182 @@ +================================================================================ + EVALUATION REPORT +================================================================================ + + +MODEL INFO: {'name': 'otter_image_llava', 'model_path': '/mnt/petrelfs/zhangyuanhan/Otter/checkpoints/otter_llava_sft_nonconv_nogroup/epoch_1/'} +-------------------------------------------------------------------------------- +[2023-12-25 21:17:32,881] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Imported class: +The current model version is configured for Otter-Image with max_num_frames set to None. +Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.084288 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.7.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: 
lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.11.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.15.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.19.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: 
lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.23.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.27.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.ff_gate, Size: 0.000001 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.0.bias, Size: 0.004096 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.1.weight, Size: 67.108864 M +Parameter: lang_encoder.model.layers.31.gated_cross_attn_layer.feed_forward.3.weight, Size: 67.108864 M +Parameter: lang_encoder.lm_head.weight, Size: 131.084288 M +Parameter: perceiver.latents, Size: 0.065536 M +Parameter: perceiver.layers.0.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.0.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.0.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.0.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.0.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_media.bias, 
Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.1.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.1.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.1.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.1.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.2.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.2.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.2.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.2.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.3.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.3.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.3.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.3.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.4.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.4.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.4.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.4.feed_forward.3.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.norm_media.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_media.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.norm_latents.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.to_q.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.to_kv.weight, Size: 1.048576 M +Parameter: perceiver.layers.5.to_out.weight, Size: 0.524288 M +Parameter: perceiver.layers.5.feed_forward.0.weight, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.0.bias, Size: 0.001024 M +Parameter: perceiver.layers.5.feed_forward.1.weight, Size: 4.194304 M +Parameter: perceiver.layers.5.feed_forward.3.weight, Size: 
4.194304 M
+Parameter: perceiver.norm.weight, Size: 0.001024 M
+Parameter: perceiver.norm.bias, Size: 0.001024 M
+Total Trainable param: 1.441004 B
+Imported class: 
+
+DATASET: MMEDataset
+--------------------
+=========== Cognition ===========
+> /mnt/petrelfs/zhangyuanhan/Otter/pipeline/benchmarks/models/otter_image_llava.py(120)generate()
+-> lang_x = self.model.text_tokenizer(
+(Pdb) *** AttributeError: 'Tensor' object has no attribute 'shap'
+(Pdb) torch.Size([1, 1, 1, 3, 336, 336])
+(Pdb) > /mnt/petrelfs/zhangyuanhan/Otter/pipeline/benchmarks/models/otter_image_llava.py(120)generate()
+-> lang_x = self.model.text_tokenizer(
+(Pdb) 
\ No newline at end of file
diff --git a/src/otter_ai/models/otter/modeling_otter.py b/src/otter_ai/models/otter/modeling_otter.py
index 7845fa76..fdad7601 100755
--- a/src/otter_ai/models/otter/modeling_otter.py
+++ b/src/otter_ai/models/otter/modeling_otter.py
@@ -32,6 +32,8 @@
 import importlib.metadata as importlib_metadata
 import torch.distributed as dist
 
+import re
+
 
 def master_print(*args, **kwargs):
@@ -802,18 +804,8 @@ def __init__(
         self.eoc_token_id = text_tokenizer.encode("<|endofchunk|>")[-1]
         self.media_token_id = text_tokenizer.encode("<image>")[-1]
 
-        extend_instance(lang_encoder, OtterLMMixin)
-        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
-        lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
-        # if lang_encoder.__class__.__name__ == "LlamaForCausalLM":
-        #     lang_encoder.resize_token_embeddings(len(text_tokenizer))
-        self.lang_encoder = lang_encoder
-        self.cross_attn_every_n_layers = config.cross_attn_every_n_layers
-        self.resampler_dim = config.resampler_dim if hasattr(config, "resampler_dim") else 64
-        # use_media_placement_augmentation is strictly false for Otter model
-        self.use_media_placement_augmentation = False  # config.use_media_placement_augmentation
+        self.llava_connection = True if hasattr(config, "llava_connection") else False
         self.max_num_frames = config.max_num_frames if hasattr(config, "max_num_frames") else None
 
         # Informative master_print statement
         if self.max_num_frames is None or self.max_num_frames == 1:
@@ -821,18 +813,45 @@
         else:
             master_print(f"The current model version is configured for Otter-Video with a maximum of {self.max_num_frames} frames.")
 
+        # use_media_placement_augmentation is strictly false for Otter model
+        self.use_media_placement_augmentation = False  # config.use_media_placement_augmentation
+
+        self.vis_dim = 1024
+
+        vision_encoder.output_tokens = True
         self.vision_encoder = vision_encoder
-        self.vis_dim = 1024
-        self.perceiver = OtterPerceiverResampler(dim=self.vis_dim, max_num_frames=self.max_num_frames, dim_head = self.resampler_dim)
-        self.lang_encoder.init_otter(
-            media_token_id=self.media_token_id,
-            vis_hidden_size=self.vis_dim,
-            cross_attn_every_n_layers=self.cross_attn_every_n_layers,
-            use_media_placement_augmentation=self.use_media_placement_augmentation,
-        )
+        if not self.llava_connection:
+            extend_instance(lang_encoder, OtterLMMixin)
+            decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
+            lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
+            self.cross_attn_every_n_layers = config.cross_attn_every_n_layers
+            self.resampler_dim = config.resampler_dim if hasattr(config, "resampler_dim") else 64
+            self.perceiver = OtterPerceiverResampler(dim=self.vis_dim, max_num_frames=self.max_num_frames, dim_head=self.resampler_dim)
+
+            # self.lang_encoder is assigned only after this branch, so init_otter is called on lang_encoder directly.
+            lang_encoder.init_otter(
+                media_token_id=self.media_token_id,
+                vis_hidden_size=self.vis_dim,
+                cross_attn_every_n_layers=self.cross_attn_every_n_layers,
+                use_media_placement_augmentation=self.use_media_placement_augmentation,
+            )
+        else:
+            mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', config.llava_connection)
+            if mlp_gelu_match is None:
+                raise ValueError(f"Unsupported llava_connection spec: {config.llava_connection}")
+            mlp_depth = int(mlp_gelu_match.group(1))
+            modules = [nn.Linear(self.vis_dim, config.text_config.hidden_size)]
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size))
+            self.perceiver = nn.Sequential(*modules)
+
+        # if lang_encoder.__class__.__name__ == "LlamaForCausalLM":
+        #     lang_encoder.resize_token_embeddings(len(text_tokenizer))
+
+        self.lang_encoder = lang_encoder
+
         if "lora_config" in config.__dict__:
             original_architecture_name = self.lang_encoder.__class__.__name__
@@ -965,19 +984,37 @@ def forward(
         else:
             # Case: do not use caching (i.e. this is a standard forward pass);
-            self._encode_vision_x(vision_x=vision_x)
+            if not self.llava_connection:
+                self._encode_vision_x(vision_x=vision_x)
+            else:
+                vision_x_input_embeds = self._encode_vision_x(vision_x=vision_x)
 
         # import pdb;pdb.set_trace()
-        output = self.lang_encoder(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            labels=labels,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            **kwargs,
-        )
-
-        if clear_conditioned_layers:
+        if not self.llava_connection:
+            output = self.lang_encoder(
+                input_ids=lang_x,
+                attention_mask=attention_mask,
+                labels=labels,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                **kwargs,
+            )
+        else:
+            # LLaVA-style path: prepend the projected vision tokens to the text embeddings.
+            vision_x_input_embeds = vision_x_input_embeds.flatten(1, -2)  # collapse media dims into a single token dim
+            lang_x_input_embeds = self.lang_encoder.get_input_embeddings()(lang_x)
+            image_attention_mask = torch.ones(vision_x_input_embeds.shape[:2], dtype=attention_mask.dtype, device=attention_mask.device)
+            attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
+            inputs_embeds = torch.cat([vision_x_input_embeds, lang_x_input_embeds], dim=1)
+            if labels is not None:
+                # Vision positions do not contribute to the loss.
+                image_labels = torch.full(vision_x_input_embeds.shape[:2], -100, dtype=labels.dtype, device=labels.device)
+                labels = torch.cat([image_labels, labels], dim=1)
+            output = self.lang_encoder(
+                input_ids=None,
+                attention_mask=attention_mask,
+                labels=labels,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                **kwargs,
+            )
+
+        if clear_conditioned_layers and not self.llava_connection:
             self.lang_encoder.clear_conditioned_layers()
 
         return output
@@ -1003,8 +1040,11 @@ def _encode_vision_x(self, vision_x: torch.Tensor):
         vision_x = self.perceiver(vision_x)  # reshapes to (b, T, n, d)
 
-        for layer in self.lang_encoder._get_decoder_layers():
-            layer.condition_vis_x(vision_x)
+        if not self.llava_connection:
+            for layer in self.lang_encoder._get_decoder_layers():
+                layer.condition_vis_x(vision_x)
+        else:
+            return vision_x
 
     @torch.no_grad()
     def generate(
@@ -1040,13 +1080,27 @@ def generate(
         num_beams = generate_kwargs.get("num_beams", 1)
         if num_beams > 1:
             vision_x = vision_x.repeat_interleave(num_beams, dim=0)
-        self._encode_vision_x(vision_x=vision_x)
-        output = self.lang_encoder.generate(
-            input_ids=lang_x,
-            attention_mask=attention_mask,
-            eos_token_id=self.eoc_token_id,
-            **generate_kwargs,
-        )
+        if not self.llava_connection:
+            self._encode_vision_x(vision_x=vision_x)
+            output = self.lang_encoder.generate(
+                input_ids=lang_x,
+                attention_mask=attention_mask,
+                eos_token_id=self.eoc_token_id,
+                **generate_kwargs,
+            )
+            self.lang_encoder.clear_conditioned_layers()
+        else:
+            vision_x_input_embeds = self._encode_vision_x(vision_x=vision_x)
+            vision_x_input_embeds = vision_x_input_embeds.flatten(1, -2)  # collapse media dims into a single token dim
+            lang_x_input_embeds = self.lang_encoder.get_input_embeddings()(lang_x)
+            image_attention_mask = torch.ones(vision_x_input_embeds.shape[:2], dtype=attention_mask.dtype, device=attention_mask.device)
+            attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
+            inputs_embeds = torch.cat([vision_x_input_embeds, lang_x_input_embeds], dim=1)
+            output = self.lang_encoder.generate(
+                input_ids=None,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                eos_token_id=self.eoc_token_id,
+                **generate_kwargs,
+            )
-        self.lang_encoder.clear_conditioned_layers()
         return output
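
For reference, the LLaVA-style connector path introduced in the diff above can be exercised in isolation. The sketch below is a minimal, self-contained approximation: the "mlp2x_gelu" spec, the 1024/4096 dimensions, and the token counts are illustrative assumptions, not values read from the repository's config.

# Minimal sketch of the LLaVA-style connector path (illustrative shapes and spec).
import re

import torch
import torch.nn as nn


def build_llava_connector(spec: str, vis_dim: int, hidden_size: int) -> nn.Sequential:
    # "mlpNx_gelu" means N linear layers with GELU activations in between.
    match = re.match(r"^mlp(\d+)x_gelu$", spec)
    if match is None:
        raise ValueError(f"Unsupported connector spec: {spec}")
    depth = int(match.group(1))
    modules = [nn.Linear(vis_dim, hidden_size)]
    for _ in range(1, depth):
        modules.append(nn.GELU())
        modules.append(nn.Linear(hidden_size, hidden_size))
    return nn.Sequential(*modules)


connector = build_llava_connector("mlp2x_gelu", vis_dim=1024, hidden_size=4096)

vision_tokens = torch.randn(1, 576, 1024)        # (batch, image tokens, vis_dim)
text_embeds = torch.randn(1, 32, 4096)           # (batch, text tokens, hidden_size)
text_mask = torch.ones(1, 32, dtype=torch.long)

image_embeds = connector(vision_tokens)                                  # (1, 576, 4096)
image_mask = torch.ones(image_embeds.shape[:2], dtype=text_mask.dtype)   # (1, 576)

inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)            # (1, 608, 4096)
attention_mask = torch.cat([image_mask, text_mask], dim=1)               # (1, 608)
# inputs_embeds and attention_mask can then be passed to the language model with input_ids=None,
# which is the same contract the llava_connection branch above relies on.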