Followed the code in demo.ipynb but the model cannot see the image #4

Open
CatYing opened this issue Aug 22, 2024 · 1 comment
Comments

CatYing commented Aug 22, 2024

CODE:

from PIL import Image
import torch
import os
from llava.serve.classes.Utils import *
from llava.serve.classes.Compiler import *


from llava.model.builder import load_mixed_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria


def render_content_with_text(key, value):
    if FILL_WITH_RANDOM_TEXT:
        if key.find("btn") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text())
        elif key.find("title") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text(length_text=5, space_number=0))
        elif key.find("text") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=56, space_number=7, with_upper_case=False))
    return value


FILL_WITH_RANDOM_TEXT = True
TEXT_PLACE_HOLDER = "[]"

model_path = "/cm/CodeFuse-VLM-14B/CodeFuse-VLM-14B/"
model_base = None
model_name = "qwen-vl-14b"
vision_tower_path = os.path.join(model_path, 'Qwen-VL-visual')
mm_projector_type = "cross_attn"
mm_projector_path = os.path.join(model_path, 'mm_projector/mm_projector.bin')

disable_torch_init()
# Load the tokenizer, model, image processor and context length
# (Qwen-VL vision tower + cross-attention projector)
tokenizer, model, image_processor, context_len = load_mixed_pretrained_model(model_path, model_base, model_name,
                                                                             vision_tower_path, mm_projector_type,
                                                                             mm_projector_path, device_map="auto")

compiler = Compiler("/cm/CodeFuse-VLM-14B/CodeFuse-MFT-VLM/llava/serve/assets/web-dsl-mapping.json")

tokenizer.pad_token_id = tokenizer.eod_id  # the Qwen tokenizer has no pad token; reuse the end-of-document token
# model = model.cuda()

image_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"

image = Image.open(image_fn).convert('RGB')
# Preprocess the image into the pixel tensor expected by the vision tower
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
import pdb; pdb.set_trace()  # debugging breakpoint

# image.show()


def inference(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    inputs += prompt

    tokens = tokenizer(
        inputs,
        max_length=tokenizer.model_max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokens.input_ids.cuda()

    stop_str = tokenizer.pad_token
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            top_k=0,
            max_new_tokens=2048,
            return_dict_in_generate=False,
            use_cache=True)
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    output_text = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    return output_text


ret = inference("Please generate DSL for the sketch on this image:\n")
print(ret)

output_text = ret.replace("<|im_end|>", "").replace("<|im_start|>", "").replace("\n", "")
output_text = output_text.lower()
output_text = output_text.replace("{", "{\n").replace("}", "\n}\n").replace("\n\n", '\n').rstrip("\n")


output_html = compiler.compile(output_text, None, rendering_function=render_content_with_text)
print(output_html)

Then the model returns:

I'm sorry, but I cannot see the image you are referring to. Please provide a detailed description of the image or upload the image itself so that I can generate the DSL for you.

CatYing commented Aug 22, 2024

After reading the source code of the prepare_inputs_labels_for_multimodal function in llava/model/language_model/llava_qwen.py, I added some special tokens to the prompt.

def inference(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:<img></img>\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    # ...
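
For reference, here is a minimal sketch of the corrected prompt construction together with a sanity check that the placeholder survives tokenization. It reuses the tokenizer loaded above; build_prompt is a helper introduced here only for illustration, and the assumption is that the Qwen-VL tokenizer encodes <img> and </img> as dedicated special tokens, which prepare_inputs_labels_for_multimodal then replaces with the image features:

def build_prompt(user_prompt):
    # Same chat template as in the original inference function, but the user turn
    # now carries the <img></img> placeholder that marks where the image features go.
    return ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\nPicture 1:<img></img>\n"
            + user_prompt + "<|im_end|>\n<|im_start|>assistant\n")

# Sanity check (assumption: "<img>" is encoded as a dedicated special-token id,
# so we look the id up instead of hard-coding it).
img_start_id = tokenizer("<img>").input_ids[-1]
ids = tokenizer(build_prompt("Please generate DSL for the sketch on this image:\n"),
                return_tensors="pt").input_ids[0].tolist()
print("<img> placeholder present after tokenization:", img_start_id in ids)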

The model can generate the DSL as expected when I use the picture from demo.ipynb.

But the model cannot generate correct DSL when I pass another picture to the model; the generated content is very weird.

The picture I use: image_test_vlm (attached screenshot)

The content the model generated:

:新創建的職位 (Chinese: "newly created position")

The prompt argument passed to the inference function: "Please generate DSL for the sketch on this image:\n"
