-
Notifications
You must be signed in to change notification settings - Fork 0
/
image_captions_florence.py
70 lines (54 loc) · 2.66 KB
/
image_captions_florence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import torch
from tools.caption_starters import clean_caption
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: trust_remote_code=True lets the checkpoint's own Python code run
# locally while the model and processor are being loaded.
model = AutoModelForCausalLM.from_pretrained(
    "gokaygokay/Florence-2-Flux-Large",
    trust_remote_code=True,
)
# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device).eval()

processor = AutoProcessor.from_pretrained(
    "gokaygokay/Florence-2-Flux-Large",
    trust_remote_code=True,
)
def run_example(task_prompt, text_input, image):
    """Run one generation pass on *image* and return the parsed answer.

    Args:
        task_prompt: Task token that starts the prompt (for example
            ``"<DESCRIPTION>"``); it is also passed as ``task`` to the
            processor's post-processing step.
        text_input: Text appended directly after ``task_prompt``.
        image: PIL image. It is converted to RGB unless its mode already
            is ``"RGB"``.

    Returns:
        The value produced by ``processor.post_process_generation`` for
        the first decoded sequence.
    """
    full_prompt = task_prompt + text_input

    # The model input must be RGB; leave images that already are untouched.
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")

    model_inputs = processor(
        text=full_prompt,
        images=rgb_image,
        return_tensors="pt",
    ).to(device)

    output_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        repetition_penalty=1.10,
    )

    # Special tokens are kept in the text handed to post-processing.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        decoded,
        task=task_prompt,
        image_size=(rgb_image.width, rgb_image.height),
    )
def process_image(folder_path):
    """Caption every image directly inside *folder_path*.

    For each file whose name ends in a known image extension, a caption is
    generated with ``run_example``, passed through ``clean_caption`` and
    written to a ``.txt`` file that shares the image's base name.

    Existing caption files are respected: a non-empty ``.txt`` causes the
    image to be skipped, while an empty one is deleted and regenerated.
    Images that cannot be opened or decoded are reported and skipped so a
    single bad file does not abort the whole folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    image_suffixes = ('.jpg', '.png', '.jpeg', '.gif', '.bmp', '.tiff')

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_suffixes):
            continue

        image_path = os.path.join(folder_path, filename)
        txt_path = os.path.splitext(image_path)[0] + ".txt"

        # Keep existing captions; only an empty caption file is regenerated.
        if os.path.exists(txt_path):
            if os.path.getsize(txt_path) != 0:
                print(f"Skipping {filename}: Text file already exists")
                continue
            os.remove(txt_path)
            print(f"Deleted empty file: {txt_path}")

        # Image.open is lazy and keeps the file open. Decode to RGB inside
        # the context manager so the handle is released before the slow
        # generation step, and so unreadable files fail here.
        try:
            with Image.open(image_path) as source:
                image = source.convert("RGB")
        except OSError as err:
            print(f"Skipping {filename}: could not read image ({err})")
            continue

        answer = run_example("<DESCRIPTION>", "Describe this image and art style.", image)
        caption = answer["<DESCRIPTION>"]
        cleaned_caption = clean_caption(caption)

        # Write the caption next to the image.
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_caption)
        print(f"{filename}: {cleaned_caption} -> {txt_path}")
def main():
    """Ask for an image folder on stdin and caption the images in it.

    The entered path has surrounding whitespace removed. If it does not
    name an existing directory, an error is printed and nothing is
    processed.
    """
    target_dir = input("Enter the path to the folder containing images: ").strip()
    if os.path.isdir(target_dir):
        process_image(target_dir)
    else:
        print(f"Error: '{target_dir}' is not a valid directory.")
# Start the interactive flow only when this file is executed as a script.
if __name__ == "__main__":
    main()