-
Notifications
You must be signed in to change notification settings - Fork 1
/
translate_image.py
132 lines (107 loc) · 4.93 KB
/
translate_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import json
import re
from typing import AsyncGenerator
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types.content import Blob, Content, Part
from google.generativeai.types import HarmBlockThreshold, HarmCategory
def initialize_genai(api_key: str, model: str = "gemini-1.5-flash-002"):
    """Configure the ``genai`` client and build a generative model handle.

    Args:
        api_key: API key passed to ``genai.configure``.
        model: Model name handed to ``genai.GenerativeModel``.

    Returns:
        The ``genai.GenerativeModel`` created for ``model``.
    """
    # configure() sets the key on the genai module, so it must run before
    # the model object is created and used.
    genai.configure(api_key=api_key)
    generative_model = genai.GenerativeModel(model)
    return generative_model
def get_prompt() -> str:
    """Return the fixed instruction prompt sent alongside the image.

    The prompt asks for a transcription of the image text (Japanese or
    English), a translation into the other language, and a JSON object with
    the keys ``detected_language``, ``ja`` and ``en``.
    """
    prompt_text = """
Role: Professional Image Text Recognizer and Translator
Languages:
- Image Text: Automatically detect (Japanese or English)
- Translation: Translate to the other language (English or Japanese)
Instructions:
1. Accurately transcribe the text in the image, detecting whether it's in Japanese or English.
2. Preserve the original text format and structure:
- Maintain bullet points, numbered lists, and other formatting elements.
- Keep line breaks and paragraph structures intact.
- Preserve any special characters or symbols used for formatting.
3. Refine the transcription:
- Retain all meaningful punctuation.
- Accurately capture any emphasis (bold, italic, underline) if discernible.
4. Translate the transcribed text to the other language (Japanese to English or English to Japanese).
5. In the translation:
- Maintain the original formatting, including lists and line breaks.
- Preserve the tone, style, and intent of the original text.
- Adapt idiomatic expressions and cultural nuances appropriately.
6. Ensure both the transcription and translation accurately reflect the original image text in content and format.
7. Always provide both the original text and its translation, regardless of the detected language.
8. Output the result in the following JSON format:
```json
{
"detected_language": "The detected language (either 'ja' or 'en')",
"ja": "The Japanese text (either transcription or translation)",
"en": "The English text (either transcription or translation)"
}
```
"""
    return prompt_text
def prepare_contents(prompt: str, image: bytes, mime_type: str = "image/jpeg") -> list[Content]:
    """Build the request contents: the text prompt followed by the image.

    Args:
        prompt: Instruction text placed in the first user message.
        image: Raw encoded image bytes placed in the second user message.
        mime_type: MIME type declared for ``image``. Defaults to
            ``"image/jpeg"`` (the value that used to be hard-coded), so
            existing two-argument callers are unaffected; pass e.g.
            ``"image/png"`` for other formats.

    Returns:
        A two-element list of ``Content`` objects, both with role ``"user"``.
    """
    prompt_message = Content(role="user", parts=[Part(text=prompt)])
    image_message = Content(
        role="user",
        parts=[Part(inline_data=Blob(mime_type=mime_type, data=image))],
    )
    return [prompt_message, image_message]
def get_generation_config():
    """Return the generation config used for every translation request.

    The config sets temperature 0, requests an ``application/json`` response,
    and supplies a response schema with three required string fields:
    ``detected_language`` (restricted to ``"ja"`` or ``"en"``), ``ja`` and
    ``en``.
    """
    schema_properties = {
        "detected_language": {"type": "string", "enum": ["ja", "en"]},
        "ja": {"type": "string"},
        "en": {"type": "string"},
    }
    response_schema = {
        "type": "object",
        "properties": schema_properties,
        "required": ["detected_language", "ja", "en"],
    }
    return genai.GenerationConfig(
        temperature=0,
        response_mime_type="application/json",
        response_schema=response_schema,
    )
def get_safety_settings():
    """Return safety settings mapping each listed harm category to BLOCK_NONE.

    Covers hate speech, dangerous content, sexually explicit content and
    harassment, in that order.
    """
    unblocked_categories = (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    return {
        category: HarmBlockThreshold.BLOCK_NONE
        for category in unblocked_categories
    }
async def transcribe_and_translate_image_stream(gen_model: genai.GenerativeModel, image: bytes) -> AsyncGenerator[tuple[str, str, str], None]:
    """Send the image to the model and stream back transcription results.

    Args:
        gen_model: Model whose ``generate_content_async`` is called with
            ``stream=True``.
        image: Raw image bytes forwarded to ``prepare_contents``.

    Yields:
        ``(detected_language, ja_text, en_text)`` tuples, exactly as produced
        by ``process_response_stream``.
    """
    request_contents = prepare_contents(get_prompt(), image)
    response = await gen_model.generate_content_async(
        contents=request_contents,
        generation_config=get_generation_config(),
        safety_settings=get_safety_settings(),
        stream=True,
    )
    async for result in process_response_stream(response):
        # Unpack and re-pack so every item is a plain 3-tuple.
        detected_lang, ja_text, en_text = result
        yield detected_lang, ja_text, en_text
async def process_response_stream(res) -> AsyncGenerator[tuple[str, str, str], None]:
    """Yield progressively more complete results while the response streams.

    The text of every chunk is appended to a running buffer; after each
    non-empty chunk the buffer is scanned for the three JSON fields and the
    best values found so far are yielded. Once the stream is exhausted, the
    fully parsed result from ``yield_final_result`` is yielded as the last
    item.

    Args:
        res: Async-iterable response whose chunks expose ``.text``
            (presumably the streaming result of ``generate_content_async``;
            confirm against the caller).

    Yields:
        ``(detected_language, ja_text, en_text)`` tuples. Fields not seen yet
        are empty strings.

    Raises:
        Exception: propagated from ``yield_final_result`` when the complete
            response text is not valid JSON.
    """
    all_text = ""
    partial_result = {"detected_language": "", "ja": "", "en": ""}
    # The "ja"/"en" patterns have no closing quote on purpose, so they also
    # match a value that is still being streamed. A backslash is only ever
    # consumed together with the character it escapes (``\\.``); otherwise an
    # escaped quote (\") would be taken for the end of the string and the
    # captured text would be truncated there.
    key_patterns = {
        "detected_language": r'"detected_language"\s*:\s*"(ja|en)"',
        "ja": r'"ja"\s*:\s*"((?:[^"\\]|\\.)*)',
        "en": r'"en"\s*:\s*"((?:[^"\\]|\\.)*)',
    }
    async for chunk in res:
        if chunk.text:
            all_text += chunk.text
            update_partial_result(all_text, partial_result, key_patterns)
            yield partial_result["detected_language"], partial_result["ja"], partial_result["en"]
    # yield_final_result is a generator function: calling it without
    # iterating does nothing, so its items must be re-yielded explicitly.
    for final_result in yield_final_result(res):
        yield final_result
# Single-character JSON escapes (the character after the backslash) and the
# text they stand for.
_SIMPLE_JSON_ESCAPES = {
    '"': '"',
    "\\": "\\",
    "/": "/",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}


def _unescape_json_fragment(raw: str) -> str:
    """Decode the simple backslash escapes in a (possibly partial) JSON string body.

    The text is processed in one left-to-right pass so that an escaped
    backslash is never re-read as the start of another escape (``\\\\n`` is a
    backslash followed by ``n``, not a newline). Unknown escapes, ``\\uXXXX``
    sequences and a trailing lone backslash are left untouched.
    """
    return re.sub(
        r"\\(.)",
        lambda m: _SIMPLE_JSON_ESCAPES.get(m.group(1), m.group(0)),
        raw,
    )


def update_partial_result(all_text: str, partial_result: dict, key_patterns: dict):
    """Refresh ``partial_result`` with the field values found in ``all_text``.

    Args:
        all_text: Response text accumulated so far (may be incomplete JSON).
        partial_result: Dict updated in place; a key is overwritten only when
            its pattern matches, so earlier values survive a non-match.
        key_patterns: Maps each result key to a regex whose first capture
            group is the raw (still JSON-escaped) value.

    Returns:
        None. The result is written into ``partial_result``.
    """
    for key, pattern in key_patterns.items():
        match = re.search(pattern, all_text)
        if match:
            partial_result[key] = _unescape_json_fragment(match.group(1))
def yield_final_result(res):
    """Parse the complete response text and yield the final result once.

    This is a generator function: nothing happens until it is iterated.

    Args:
        res: Object whose ``.text`` attribute holds the full JSON response.

    Yields:
        A single ``(detected_language, ja, en)`` tuple read from the parsed
        JSON object.

    Raises:
        Exception: with message ``"failed to decode final json"`` when
            ``res.text`` is not valid JSON; the original
            ``json.JSONDecodeError`` is attached as ``__cause__``.
        KeyError: if the parsed object lacks one of the three keys.
    """
    # Keep the try body limited to the call that can raise JSONDecodeError.
    try:
        final_json = json.loads(res.text)
    except json.JSONDecodeError as err:
        raise Exception("failed to decode final json") from err
    yield final_json["detected_language"], final_json["ja"], final_json["en"]