CODE:

```python
from PIL import Image
import torch
import os
from llava.serve.classes.Utils import *
from llava.serve.classes.Compiler import *
from llava.model.builder import load_mixed_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

FILL_WITH_RANDOM_TEXT = True
TEXT_PLACE_HOLDER = "[]"


def render_content_with_text(key, value):
    # Replace the DSL text placeholders with random strings when rendering to HTML.
    if FILL_WITH_RANDOM_TEXT:
        if key.find("btn") != -1:
            value = value.replace(TEXT_PLACE_HOLDER, Utils.get_random_text())
        elif key.find("title") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=5, space_number=0))
        elif key.find("text") != -1:
            value = value.replace(TEXT_PLACE_HOLDER,
                                  Utils.get_random_text(length_text=56, space_number=7, with_upper_case=False))
    return value


model_path = "/cm/CodeFuse-VLM-14B/CodeFuse-VLM-14B/"
model_base = None
model_name = "qwen-vl-14b"
vision_tower_path = os.path.join(model_path, 'Qwen-VL-visual')
mm_projector_type = "cross_attn"
mm_projector_path = os.path.join(model_path, 'mm_projector/mm_projector.bin')

disable_torch_init()
tokenizer, model, image_processor, context_len = load_mixed_pretrained_model(
    model_path, model_base, model_name,
    vision_tower_path, mm_projector_type,
    mm_projector_path, device_map="auto")
compiler = Compiler("/cm/CodeFuse-VLM-14B/CodeFuse-MFT-VLM/llava/serve/assets/web-dsl-mapping.json")
tokenizer.pad_token_id = tokenizer.eod_id
# model = model.cuda()

image_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"
image = Image.open(image_fn).convert('RGB')
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
import pdb; pdb.set_trace()  # debug breakpoint
# image.show()


def inference(prompt):
    # Build a ChatML-style prompt: system turn, user turn, then an open assistant turn.
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    inputs += prompt
    tokens = tokenizer(
        inputs,
        max_length=tokenizer.model_max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = tokens.input_ids.cuda()
    stop_str = tokenizer.pad_token
    keywords = [stop_str]
    # NOTE: constructed but never passed to model.generate() below.
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).cuda(),
            do_sample=True,
            temperature=0.2,
            top_p=0.3,
            top_k=0,
            max_new_tokens=2048,
            return_dict_in_generate=False,
            use_cache=True)
    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
    # Decode only the newly generated tokens.
    output_text = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
    return output_text


ret = inference("Please generate DSL for the sketch on this image:\n")
print(ret)
# Strip the chat markers and normalize the DSL before compiling it to HTML.
output_text = ret.replace("<|im_end|>", "").replace("<|im_start|>", "").replace("\n", "")
output_text = output_text.lower()
output_text = output_text.replace("{", "{\n").replace("}", "\n}\n").replace("\n\n", '\n').rstrip("\n")
output_html = compiler.compile(output_text, None, rendering_function=render_content_with_text)
print(output_html)
```
Then the model returns:
I'm sorry, but I cannot see the image you are referring to. Please provide a detailed description of the image or upload the image itself so that I can generate the DSL for you.
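Note that the `stopping_criteria` constructed in the script is never passed to `model.generate`, so it has no effect. If it is meant to stop generation at the pad token, it has to be wired in explicitly (a sketch, assuming `KeywordsStoppingCriteria` implements the standard Hugging Face `StoppingCriteria` interface):

```python
from transformers import StoppingCriteriaList

# Hedged sketch: pass the keyword-based stopping criterion through to generate().
# KeywordsStoppingCriteria comes from llava.mm_utils, as imported above.
output_ids = model.generate(
    input_ids,
    images=image_tensor.unsqueeze(0).cuda(),
    do_sample=True,
    temperature=0.2,
    top_p=0.3,
    max_new_tokens=2048,
    use_cache=True,
    stopping_criteria=StoppingCriteriaList([stopping_criteria]),
)
```

This does not explain the refusal above, though; that turns out to be a prompt problem (see the next comment).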
After reading the source code of the prepare_inputs_labels_for_multimodal function in llava/model/language_model/llava_qwen.py, I added some special tokens to the prompt:
```python
def inference(prompt):
    inputs = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    prompt = "<|im_start|>user\n" + "Picture 1:<img></img>\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"
    # ...
```
With this change, the model can generate DSL as expected when I use the picture from demo.ipynb. But when I pass another picture to the model, it cannot generate correct DSL; the generated content is very weird.
The picture I use:

The content the model generated:

:新創建的職位 (Chinese: "newly created position")

The prompt argument passed to the inference function: `Please generate DSL for the sketch on this image:\n`
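One thing worth ruling out (a suggestion of mine, not verified in this thread) is a preprocessing mismatch between the demo picture and the new one. Both go through the same image_processor, so their tensors should come out with identical shape, dtype, and value range:

```python
# Hedged debugging sketch: compare how the working demo image and the failing
# image come out of the same preprocessing pipeline. The demo path below is a
# placeholder; substitute the image actually used in demo.ipynb.
demo_fn = "/cm/CodeFuse-VLM-14B/demo_image.png"  # hypothetical path
fail_fn = "/cm/CodeFuse-VLM-14B/image_test_vlm.png"

for fn in (demo_fn, fail_fn):
    img = Image.open(fn).convert('RGB')
    tensor = image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0]
    print(fn, img.size, tuple(tensor.shape), tensor.dtype,
          tensor.min().item(), tensor.max().item())
```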