Followed the code in demo.ipynb but the model cannot see the image #4

Open
CatYing opened this issue Aug 22, 2024 · 1 comment
Comments

CatYing commented Aug 22, 2024

CODE:

from PIL import Image
import torch
import os
from llava.serve.classes.Utils import *
from llava.serve.classes.Compiler import *


from llava.model.builder import load_mixed_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria


def render_content_with_text(key, value):
    if FILL_WITH_RANDOM_TEXT:
        if key.find("btn") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text())
        elif key.find("title") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text(length_text=5, space_number=0))
        elif key.find("text") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=56, space_number=7, with_upper_case=False))
    return value


FILL_WITH_RANDOM_TEXT = True
TEXT_PLACE_HOLDER = "[]"

model_path = "/cm/CodeFuse-VLM-14B/CodeFuse-VLM-14B/"
model_base = None
model_name = "qwen-vl-14b"
vision_tower_path = os.path.join(model_path, 'Qwen-VL-visual')
mm_projector_type = "cross_attn"
mm_projector_path = os.path.join(model_path, 'mm_projector/mm_projector.bin')

disable_torch_init()
# Load the tokenizer, model, image processor and context length
# (Qwen-VL vision tower + cross-attention projector)
tokenizer, model, image_processor, context_len = load_mixed_pretrained_model(model_path, model_base, model_name,
                                                                             vision_tower_path, mm_projector_type,
                                                                             mm_projector_path, device_map="auto")

compiler = Compiler("/cm/CodeFuse-VLM-14B/CodeFuse-MFT-VLM/llava/serve/assets/web-dsl-mapping.json")

tokenizer.pad_token_id = tokenizer.eod_id  # the Qwen tokenizer has no pad token; reuse the end-of-document token
# model = model.cuda()

image_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"

image = Image.open(image_fn).convert('RGB')
# Preprocess the image into the pixel tensor expected by the vision tower
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
import pdb; pdb.set_trace()  # debugging breakpoint

# image.show()


def inference(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    inputs += prompt

    tokens = tokenizer(
        inputs,
        max_length=tokenizer.model_max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokens.input_ids.cuda()

    stop_str = tokenizer.pad_token
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            top_k=0,
            max_new_tokens=2048,
            return_dict_in_generate=False,
            use_cache=True)
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    output_text = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    return output_text


ret = inference("Please generate DSL for the sketch on this image:\n")
print(ret)

output_text = ret.replace("<|im_end|>", "").replace("<|im_start|>", "").replace("\n", "")
output_text = output_text.lower()
output_text = output_text.replace("{", "{\n").replace("}", "\n}\n").replace("\n\n", '\n').rstrip("\n")


output_html = compiler.compile(output_text, None, rendering_function=render_content_with_text)
print(output_html)

Then the model returns:

I'm sorry, but I cannot see the image you are referring to. Please provide a detailed description of the image or upload the image itself so that I can generate the DSL for you.

CatYing commented Aug 22, 2024

After reading the source code of the prepare_inputs_labels_for_multimodal function in llava/model/language_model/llava_qwen.py, I added some special tokens to the prompt.

def inference(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:<img></img>\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    # ...
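
For reference, here is a minimal sketch of the corrected prompt construction together with a sanity check that the placeholder survives tokenization. It reuses the tokenizer loaded above; build_prompt is a helper introduced here only for illustration, and the assumption is that the Qwen-VL tokenizer encodes <img> and </img> as dedicated special tokens, which prepare_inputs_labels_for_multimodal then replaces with the image features:

def build_prompt(user_prompt):
    # Same chat template as in the original inference function, but the user turn
    # now carries the <img></img> placeholder that marks where the image features go.
    return ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\nPicture 1:<img></img>\n"
            + user_prompt + "<|im_end|>\n<|im_start|>assistant\n")

# Sanity check (assumption: "<img>" is encoded as a dedicated special-token id,
# so we look the id up instead of hard-coding it).
img_start_id = tokenizer("<img>").input_ids[-1]
ids = tokenizer(build_prompt("Please generate DSL for the sketch on this image:\n"),
                return_tensors="pt").input_ids[0].tolist()
print("<img> placeholder present after tokenization:", img_start_id in ids)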

The model can generate the DSL as expected when I use the picture from demo.ipynb.

But the model cannot generate correct DSL when I pass another picture to the model; the generated content is very weird.

The picture I use: image_test_vlm (attached screenshot)

The content the model generated:

:新創建的職位 (Chinese: "newly created position")

The prompt argument passed to the inference function: "Please generate DSL for the sketch on this image:\n"
