diff --git a/README.md b/README.md
index 88de5820..27e6e4b8 100644
--- a/README.md
+++ b/README.md
@@ -6,32 +6,129 @@
 ![Contributors](https://img.shields.io/github/contributors/zyddnys/manga-image-translator)
 [![Discord](https://img.shields.io/discord/739305951085199490?logo=discord&label=discord&logoColor=white)](https://discord.gg/Ak8APNy4vb)
+
 > Translate texts in manga/images.\
 > [中文说明](README_CN.md) | [Change Log](CHANGELOG.md) \
 > Join us on discord
-Some manga/images will never be translated, therefore this project is born.\
-Primarily designed for translating Japanese text, but also supports Chinese, English and Korean.\
-Supports inpainting and text rendering.\
-Successor to \
-Also check out GUI implementation:
-
-**This is a hobby project, you are welcome to contribute!**\
-Currently this only a simple demo, many imperfections exist, we need your support to make this project better!
-Samples are found under [here](#samples).
-
-## Support Us
+Some manga/images will never be translated, and that is why this project was born.
+
+- [Image/Manga Translator](#imagemanga-translator)
+  - [Samples](#samples)
+  - [Online Demo](#online-demo)
+  - [Disclaimer](#disclaimer)
+  - [Installation](#installation)
+    - [Pip/venv](#pipvenv)
+    - [Poetry](#poetry)
+    - [Additional instructions for **Windows**](#additional-instructions-for-windows)
+    - [Docker](#docker)
+      - [Hosting the web server](#hosting-the-web-server)
+      - [Using as CLI](#using-as-cli)
+      - [Setting Translation Secrets](#setting-translation-secrets)
+      - [Using with Nvidia GPU](#using-with-nvidia-gpu)
+      - [Building locally](#building-locally)
+  - [Usage](#usage)
+    - [Batch mode (default)](#batch-mode-default)
+    - [Demo mode](#demo-mode)
+    - [Web Mode](#web-mode)
+    - [Api Mode](#api-mode)
+  - [Related Projects](#related-projects)
+  - [Docs](#docs)
+    - [Recommended Modules](#recommended-modules)
+      - [Tips to improve translation quality](#tips-to-improve-translation-quality)
+    - [Options](#options)
+    - [Language Code Reference](#language-code-reference)
+    - [Translators Reference](#translators-reference)
+      - [GPT Config Reference](#gpt-config-reference)
+    - [Using Gimp for rendering](#using-gimp-for-rendering)
+    - [Api Documentation](#api-documentation)
+      - [Synchronous mode](#synchronous-mode)
+      - [Asynchronous mode](#asynchronous-mode)
+      - [Manual translation](#manual-translation)
+  - [Next steps](#next-steps)
+  - [Support Us](#support-us)
+    - [Thanks To All Our Contributors :](#thanks-to-all-our-contributors-)

-GPU server is not cheap, please consider to donate to us.
+## Samples

-- Ko-fi:
-- Patreon:
-- 爱发电:
+Please note that the samples may not always be up to date; they may not represent the current main branch version.

-
-### Thanks To All Our Contributors :
-
+| Original | Translated |
+| -------- | ---------- |
+| 佐藤さんは知っていた - 猫麦 (source: @09ra_19ra) | translated output, plus generated mask |
+| Gris finds out she's of royal blood - VERTI (source: @VERTIGRIS_ART) | translated output with `--detector ctd`, plus generated mask |
+| 陰キャお嬢様の新学期🏫📔🌸 (#3) - ひづき夜宵🎀💜 (source: @hiduki_yayoi) | inpainting-only output with `--translator none`, plus generated mask |
+| 幼なじみの高校デビューの癖がすごい (#1) - 神吉李花☪️🐧 (source: @rikak) | translated output, plus generated mask |
## Online Demo

@@ -42,9 +139,16 @@ Browser Userscript (by QiroNT):

In that case you can wait for me to restart the service, which may take up to 24 hrs.
- Note this online demo is using the current main branch version.

+## Disclaimer
+Successor to [MMDOCR-HighPerformance](https://github.com/PatchyVideo/MMDOCR-HighPerformance).\
+**This is a hobby project, you are welcome to contribute!**\
+Currently this is only a simple demo; many imperfections exist, and we need your support to make this project better!\
+Primarily designed for translating Japanese text, but it also supports Chinese, English and Korean.\
+Supports inpainting, text rendering and colorization.

## Installation

-### Pip
+### Pip/venv
+

```bash
# First, you need to have Python(>=3.8) installed on your system
# The latest version often does not work with some pytorch libraries yet
$ python --version
Python 3.10.6

# Clone this repo
$ git clone https://github.com/zyddnys/manga-image-translator.git

+# Create venv
+$ python -m venv venv
+
+# Activate venv
+$ source venv/bin/activate
+
# For --use-gpu option go to https://pytorch.org/ and follow
# pytorch installation instructions. Add `--upgrade --force-reinstall`
# to the pip command to overwrite the currently installed pytorch version.

@@ -77,22 +187,105 @@ The models will be downloaded into `./models` at runtime.

#### Additional instructions for **Windows**

-Before you start the pip install, first install Microsoft C++ Build Tools ([Download](https://visualstudio.microsoft.com/vs/),
+Before you start the pip install, first install Microsoft C++ Build
+Tools ([Download](https://visualstudio.microsoft.com/vs/),
[Instructions](https://stackoverflow.com/questions/40504552/how-to-install-visual-c-build-tools))
as some pip dependencies will not compile without it.
(See [#114](https://github.com/zyddnys/manga-image-translator/issues/114)).

To use [cuda](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64)
-on windows install the correct pytorch version as instructed on .
+on Windows, install the correct pytorch version as instructed on https://pytorch.org/.
+
+Also, if you have trouble installing pydensecrf with the command above you can install the pre-compiled wheels
+with `pip install https://www.lfd.uci.edu/~gohlke/pythonlibs/#_pydensecrf`.
+
+### Docker
+
+Requirements:
+
+- Docker (version 19.03+ required for CUDA / GPU acceleration)
+- Docker Compose (optional; needed if you want to use the files in the `demo/doc` folder)
+- Nvidia Container Runtime (optional; needed if you want to use CUDA)
+
+This project provides Docker support via the `zyddnys/manga-image-translator:main` image.
+The image contains all required dependencies and models for the project.
+It should be noted that this image is fairly large (~15GB).
+
+#### Hosting the web server
+
+For CPU, the web server can be hosted using
+
+```bash
+docker run -p 5003:5003 -v result:/app/result --ipc=host --rm zyddnys/manga-image-translator:main -l ENG --manga2eng -v --mode web --host=0.0.0.0 --port=5003
+```
+
+or
+
+```bash
+docker-compose -f demo/doc/docker-compose-web-with-cpu.yml up
+```
+
+depending on which you prefer. The web server should start on port [5003](http://localhost:5003)
+and images will appear in the `/result` folder.
+
+#### Using as CLI
+
+To use docker with the CLI (i.e. in batch mode):
-Also, if you have trouble installing pydensecrf with the command above you can install the pre-compiled wheels with `pip install https://www.lfd.uci.edu/~gohlke/pythonlibs/#_pydensecrf`.
+```bash
+docker run -v <targetFolder>:/app/<targetFolder> -v <targetFolder>-translated:/app/<targetFolder>-translated --ipc=host --rm zyddnys/manga-image-translator:main --mode=batch -i=/app/<targetFolder>
+```
+
+**Note:** If you need to reference files on your host machine,
+you will need to mount them as volumes into the `/app` folder inside the container.
+Paths for the CLI need to be the internal docker paths `/app/...`, not the paths on your host machine.
+
+#### Setting Translation Secrets
+
+Some translation services require API keys to function. To set them, pass them as env vars into the docker container. For
+example:
+
+```bash
+docker run --env="DEEPL_AUTH_KEY=xxx" --ipc=host --rm zyddnys/manga-image-translator:main
+```
+
+#### Using with Nvidia GPU
+
+> To use with a supported GPU, please first read the initial `Docker` section. There are some special dependencies you
+> will need to use.
+
+Run the container with the following flags set:
+
+```bash
+docker run ... --gpus=all ... zyddnys/manga-image-translator:main ... --use-gpu
+```
+
+Or (for the web server + GPU)
+
+```bash
+docker-compose -f demo/doc/docker-compose-web-with-gpu.yml up
+```
+
+#### Building locally
+
+To build the docker image locally you can run (you will need `make` on your machine)
+
+```bash
+make build-image
+```
+
+Then, to test the built image, run
+
+```bash
+make run-web-server
+```

## Usage

-#### Batch mode (default)
+### Batch mode (default)

```bash
-# use `--use-gpu` for speedup if you have a compatible NVIDIA GPU or using Apple Silicon.
+# use `--use-gpu` for speedup if you have a compatible NVIDIA GPU.
# use `--target-lang <language_code>` to specify a target language.
# use `--inpainter=none` to disable inpainting.
# use `--translator=none` if you only want to use inpainting (blank bubbles)
# replace <path> with the path to the image folder or file.
$ python -m manga_translator -v --translator=google -l ENG -i <path>
# results can be found under `<path>-translated`.
```

-#### Demo mode
+### Demo mode

```bash
# saves singular image into /result folder for demonstration purposes
# use `--mode demo` to enable demo translation.
# replace <path> with the path to the image file.
$ python -m manga_translator --mode demo -v --translator=google -l ENG -i <path>
# result can be found in `result/`.
```

-#### Web Mode
+### Web Mode

```bash
# use `--mode web` to start a web server.
$ python -m manga_translator -v --mode web --use-gpu
# the demo will be serving on http://127.0.0.1:5003
```

-#### Manual translation
-
-Manual translation replaces machine translation with human translators.
-Basic manual translation demo can be found at when using web mode.
-
-API V2 -
+### Api Mode

```bash
-# use `--mode api` to start a web server.
+# use `--mode api` to start an api server.
$ python -m manga_translator -v --mode api --use-gpu
-# the api will be serving on http://127.0.0.1:5003
-```
-Api is accepting json(post) and multipart.
-
-Api endpoints are `/colorize_translate`, `/inpaint_translate`, `/translate`, `/get_text`. -
-Valid arguments for the api are:
+# the api will be serving on http://127.0.0.1:5003
```

```
// These are taken from args.py. For more info see README.md
detector: String
ocr: String
inpainter: String
upscaler: String
translator: String
target_language: String
upscale_ratio: Integer
translator_chain: String
selective_translation: String
attempts: Integer
detection_size: Integer // 1024 => 'S', 1536 => 'M', 2048 => 'L', 2560 => 'X'
text_threshold: Float
box_threshold: Float
unclip_ratio: Float
inpainting_size: Integer
det_rotate: Bool
det_auto_rotate: Bool
det_invert: Bool
det_gamma_correct: Bool
min_text_length: Integer
colorization_size: Integer
denoise_sigma: Integer
mask_dilation_offset: Integer
ignore_bubble: Integer
gpt_config: String
filter_text: String
overlay_type: String

+## Related Projects
+GUI implementation: [BallonsTranslator](https://github.com/dmMaze/BallonsTranslator)

// These are api specific args
direction: String // {'auto', 'h', 'v'}
base64Images: String //Image in base64 format
image: Multipart // image upload from multipart
url: String // an url string
```

+## Docs
-
-API -
+## Docs -Two modes of translation service are provided by the demo: synchronous mode and asynchronous mode.\ -In synchronous mode your HTTP POST request will finish once the translation task is finished.\ -In asynchronous mode your HTTP POST request will respond with a `task_id` immediately, you can use this `task_id` to poll for translation task state. +### Recommended Modules +Detector: +- ENG: ?? +- JPN: ?? +- CHS: ?? +- KOR: ?? +- Using `--detector ctd` can increase the amount of text lines detected -#### Synchronous mode -1. POST a form request with form data `file:` to -2. Wait for response -3. Use the resultant `task_id` to find translation result in `result/` directory, e.g. using Nginx to expose `result/` +OCR: +- ENG: ?? +- JPN: ?? +- CHS: ?? +- KOR: 48px -#### Asynchronous mode +Translator: +- JPN -> ENG: **Sugoi** +- CHS -> ENG: ?? +- CHS -> JPN: ?? +- JPN -> CHS: ?? +- ENG -> JPN: ?? +- ENG -> CHS: ?? -1. POST a form request with form data `file:` to -2. Acquire translation `task_id` -3. Poll for translation task state by posting JSON `{"taskid": }` to -4. Translation is finished when the resultant state is either `finished`, `error` or `error-lang` -5. Find translation result in `result/` directory, e.g. using Nginx to expose `result/` +Inpainter: ?? -#### Manual translation +Colorizer: **mc2** -POST a form request with form data `file:` to -and wait for response. + -You will obtain a JSON response like this: +#### Tips to improve translation quality -```json -{ - "task_id": "12c779c9431f954971cae720eb104499", - "status": "pending", - "trans_result": [ - { - "s": "☆上司来ちゃった……", - "t": "" - } - ] -} -``` +- Small resolutions can sometimes trip up the detector, which is not so good at picking up irregular text sizes. To + circumvent this you can use an upscaler by specifying `--upscale-ratio 2` or any other value +- If the text being rendered is too small to read specify `--font-size-minimum 30` for instance or use the `--manga2eng` + renderer that will try to adapt to detected textbubbles +- Specify a font with `--font-path fonts/anime_ace_3.ttf` for example -Fill in translated texts: - -```json -{ - "task_id": "12c779c9431f954971cae720eb104499", - "status": "pending", - "trans_result": [ - { - "s": "☆上司来ちゃった……", - "t": "☆Boss is here..." - } - ] -} -``` - -Post translated JSON to and wait for response.\ -Then you can find the translation result in `result/` directory, e.g. using Nginx to expose `result/`. - -
- -#### Using Gimp for rendering - -When setting output format to {`xcf`, `psd`, `pdf`} Gimp will be used to generate the file. - -On Windows this assumes Gimp 2.x to be installed to `C:\Users\\AppData\Local\Programs\Gimp 2`. - -The resulting `.xcf` file contains the original image as the lowest layer and it has the inpainting as a separate layer. -The translated textboxes have their own layers with the original text as the layer name for easy access. - -Limitations: -- Gimp will turn text layers to regular images when saving `.psd` files. -- Rotated text isn't handled well in Gimp. When editing a rotated textbox it'll also show a popup that it was modified by an outside program. -- Font family is controlled separately, with the `--gimp-font` argument. - -### Translators Reference - -| Name | API Key | Offline | Note | -| ---------- | ------- | ------- | ------------------------------------------------------ | -| google | | | | -| youdao | ✔️ | | Requires `YOUDAO_APP_KEY` and `YOUDAO_SECRET_KEY` | -| baidu | ✔️ | | Requires `BAIDU_APP_ID` and `BAIDU_SECRET_KEY` | -| deepl | ✔️ | | Requires `DEEPL_AUTH_KEY` | -| caiyun | ✔️ | | Requires `CAIYUN_TOKEN` | -| gpt3 | ✔️ | | Implements text-davinci-003. Requires `OPENAI_API_KEY` | -| gpt3.5 | ✔️ | | Implements gpt-3.5-turbo. Requires `OPENAI_API_KEY` | -| gpt4 | ✔️ | | Implements gpt-4. Requires `OPENAI_API_KEY` | -| papago | | | | -| offline | | ✔️ | Chooses most suitable offline translator for language | -| sugoi | | ✔️ | Sugoi V4.0 Models (recommended for JPN->ENG) | -| m2m100 | | ✔️ | Supports every language | -| m2m100_big | | ✔️ | | -| mbart50 | | ✔️ | | -| none | | ✔️ | Translate to empty texts | -| original | | ✔️ | Keep original texts | - -- API Key: Whether the translator requires an API key to be set as environment variable. - For this you can create a .env file in the project root directory containing your api keys like so: - -```env -OPENAI_API_KEY=sk-xxxxxxx... -DEEPL_AUTH_KEY=xxxxxxxx... -``` - -- Offline: Whether the translator can be used offline. - -- Sugoi is created by mingshiba, please support him in https://www.patreon.com/mingshiba - -#### GPT Config Reference - -Used by the `--gpt-config` argument. - -```yaml -# The prompt being feed into GPT before the text to translate. -# Use {to_lang} to indicate where the target language name should be inserted. -# Note: ChatGPT models don't use this prompt. -prompt_template: > - Please help me to translate the following text from a manga to {to_lang} - (if it's already in {to_lang} or looks like gibberish you have to output it as it is instead):\n - -# What sampling temperature to use, between 0 and 2. -# Higher values like 0.8 will make the output more random, -# while lower values like 0.2 will make it more focused and deterministic. -temperature: 0.5 - -# An alternative to sampling with temperature, called nucleus sampling, -# where the model considers the results of the tokens with top_p probability mass. -# So 0.1 means only the tokens comprising the top 10% probability mass are considered. -top_p: 1 - -# The prompt being feed into ChatGPT before the text to translate. -# Use {to_lang} to indicate where the target language name should be inserted. -# Tokens used in this example: 57+ -chat_system_template: > - You are a professional translation engine, - please translate the story into a colloquial, - elegant and fluent content, - without referencing machine translations. - You must only translate the story, never interpret it. 
  If there is any issue in the text, output it as is.

  Translate to {to_lang}.

# Samples being feed into ChatGPT to show an example conversation.
# In a [prompt, response] format, keyed by the target language name.
#
# Generally, samples should include some examples of translation preferences, and ideally
# some names of characters it's likely to encounter.
#
# If you'd like to disable this feature, just set this to an empty list.
chat_sample:
  Simplified Chinese: # Tokens used in this example: 88 + 84
    - <|1|>恥ずかしい… 目立ちたくない… 私が消えたい…
      <|2|>きみ… 大丈夫⁉
      <|3|>なんだこいつ 空気読めて ないのか…?
    - <|1|>好尴尬…我不想引人注目…我想消失…
      <|2|>你…没事吧⁉
      <|3|>这家伙怎么看不懂气氛的…?

# Overwrite configs for a specific model.
# For now the list is: gpt3, gpt35, gpt4
gpt35:
  temperature: 0.3
```

### Language Code Reference

Used by the `--target-lang` or `-l` argument.

```yaml
CHS: Chinese (Simplified)
CHT: Chinese (Traditional)
CSY: Czech
NLD: Dutch
ENG: English
FRA: French
DEU: German
HUN: Hungarian
ITA: Italian
JPN: Japanese
KOR: Korean
PLK: Polish
PTB: Portuguese (Brazil)
ROM: Romanian
RUS: Russian
ESP: Spanish
TRK: Turkish
UKR: Ukrainian
VIN: Vietnames
ARA: Arabic
SRP: Serbian
HRV: Croatian
THA: Thai
```

-
-## Options
+### Options

```text
-h, --help show this help message and exit
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--model-dir MODEL_DIR Model directory (by default ./models in project root)
---use-gpu Turn on/off gpu ( automatic del)
+--use-gpu Turn on/off gpu
--use-gpu-limited Turn on/off gpu (excluding offline translator)
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an image, DO NOT use craft for manga, it's not designed
@@ -495,183 +484,291 @@ THA: Thai

-## Tips to improve translation quality

-- Using `--detector ctd` can increase the amount of text lines detected
-- Small resolutions can sometimes trip up the detector, which is not so good at picking up irregular text sizes. To circumvent this you can use an upscaler by specifying `--upscale-ratio 2` or any other value
-- If the text being rendered is too small to read specify `--font-size-minimum 30` for instance or use the `--manga2eng` renderer that will try to adapt to detected textbubbles
-- Specify a font with `--font-path fonts/anime_ace_3.ttf` for example

-## Docker
+### Language Code Reference

-Requirements:
+Used by the `--target-lang` or `-l` argument.

-- Docker (version 19.03+ required for CUDA / GPU acceleration)
-- Docker Compose (Optional if you want to use files in the `demo/doc` folder)
-- Nvidia Container Runtime (Optional if you want to use CUDA)
+```yaml
+CHS: Chinese (Simplified)
+CHT: Chinese (Traditional)
+CSY: Czech
+NLD: Dutch
+ENG: English
+FRA: French
+DEU: German
+HUN: Hungarian
+ITA: Italian
+JPN: Japanese
+KOR: Korean
+PLK: Polish
+PTB: Portuguese (Brazil)
+ROM: Romanian
+RUS: Russian
+ESP: Spanish
+TRK: Turkish
+UKR: Ukrainian
+VIN: Vietnamese
+ARA: Arabic
+SRP: Serbian
+HRV: Croatian
+THA: Thai
+```

-This project has docker support under `zyddnys/manga-image-translator:main` image.
-This docker image contains all required dependencies / models for the project.
-It should be noted that this image is fairly large (~ 15GB).
### Translators Reference

| Name       | API Key | Offline | Note                                                           |
|------------|---------|---------|----------------------------------------------------------------|
| google     |         |         |                                                                |
| youdao     | ✔️      |         | Requires `YOUDAO_APP_KEY` and `YOUDAO_SECRET_KEY`              |
| baidu      | ✔️      |         | Requires `BAIDU_APP_ID` and `BAIDU_SECRET_KEY`                 |
| deepl      | ✔️      |         | Requires `DEEPL_AUTH_KEY`                                      |
| caiyun     | ✔️      |         | Requires `CAIYUN_TOKEN`                                        |
| gpt3       | ✔️      |         | Implements text-davinci-003. Requires `OPENAI_API_KEY`         |
| gpt3.5     | ✔️      |         | Implements gpt-3.5-turbo. Requires `OPENAI_API_KEY`            |
| gpt4       | ✔️      |         | Implements gpt-4. Requires `OPENAI_API_KEY`                    |
| papago     |         |         |                                                                |
| offline    |         | ✔️      | Chooses the most suitable offline translator for the language  |
| sugoi      |         | ✔️      | Sugoi V4.0 Models                                              |
| m2m100     |         | ✔️      | Supports every language                                        |
| m2m100_big |         | ✔️      |                                                                |
| none       |         | ✔️      | Translates to empty texts                                      |
| original   |         | ✔️      | Keeps original texts                                           |

- API Key: Whether the translator requires an API key to be set as an environment variable.
  For this you can create a .env file in the project root directory containing your API keys like so:

```env
OPENAI_API_KEY=sk-xxxxxxx...
DEEPL_AUTH_KEY=xxxxxxxx...
```

- Offline: Whether the translator can be used offline.

- Sugoi is created by mingshiba, please support him at https://www.patreon.com/mingshiba

### GPT Config Reference

Used by the `--gpt-config` argument.

```yaml
# The prompt being fed into GPT before the text to translate.
# Use {to_lang} to indicate where the target language name should be inserted.
# Note: ChatGPT models don't use this prompt.
prompt_template: >
  Please help me to translate the following text from a manga to {to_lang}
  (if it's already in {to_lang} or looks like gibberish you have to output it as it is instead):\n

# What sampling temperature to use, between 0 and 2.
# Higher values like 0.8 will make the output more random,
# while lower values like 0.2 will make it more focused and deterministic.
temperature: 0.5

# An alternative to sampling with temperature, called nucleus sampling,
# where the model considers the results of the tokens with top_p probability mass.
# So 0.1 means only the tokens comprising the top 10% probability mass are considered.
top_p: 1

# The prompt being fed into ChatGPT before the text to translate.
# Use {to_lang} to indicate where the target language name should be inserted.
# Tokens used in this example: 57+
chat_system_template: >
  You are a professional translation engine,
  please translate the story into a colloquial,
  elegant and fluent content,
  without referencing machine translations.
  You must only translate the story, never interpret it.
  If there is any issue in the text, output it as is.

  Translate to {to_lang}.

# Samples being fed into ChatGPT to show an example conversation.
# In a [prompt, response] format, keyed by the target language name.
#
# Generally, samples should include some examples of translation preferences, and ideally
# some names of characters it's likely to encounter.
#
# If you'd like to disable this feature, just set this to an empty list.
chat_sample:
  Simplified Chinese: # Tokens used in this example: 88 + 84
    - <|1|>恥ずかしい… 目立ちたくない… 私が消えたい…
      <|2|>きみ… 大丈夫⁉
      <|3|>なんだこいつ 空気読めて ないのか…?
    - <|1|>好尴尬…我不想引人注目…我想消失…
      <|2|>你…没事吧⁉
      <|3|>这家伙怎么看不懂气氛的…?

# Overwrite configs for a specific model.
# For now the list is: gpt3, gpt35, gpt4
gpt35:
  temperature: 0.3
```

### Using Gimp for rendering

When setting the output format to {`xcf`, `psd`, `pdf`}, Gimp will be used to generate the file.

On Windows this assumes Gimp 2.x to be installed to `C:\Users\<Username>\AppData\Local\Programs\Gimp 2`.

The resulting `.xcf` file contains the original image as the lowest layer and the inpainting as a separate layer.
The translated textboxes have their own layers, with the original text as the layer name for easy access.

Limitations:

- Gimp will turn text layers into regular images when saving `.psd` files.
- Rotated text isn't handled well in Gimp. When editing a rotated textbox it'll also show a popup that it was modified
  by an outside program.
- Font family is controlled separately, with the `--gimp-font` argument.

### Api Documentation
+API V2 +
```bash
# use `--mode api` to start an api server.
$ python -m manga_translator -v --mode api --use-gpu
# the api will be serving on http://127.0.0.1:5003
```

The api accepts JSON (POST) and multipart form requests.
+Api endpoints are `/colorize_translate`, `/inpaint_translate`, `/translate`, `/get_text`. +
Valid arguments for the api are:

```
// These are taken from args.py. For more info see README.md
detector: String
ocr: String
inpainter: String
upscaler: String
translator: String
target_language: String
upscale_ratio: Integer
translator_chain: String
selective_translation: String
attempts: Integer
detection_size: Integer // 1024 => 'S', 1536 => 'M', 2048 => 'L', 2560 => 'X'
text_threshold: Float
box_threshold: Float
unclip_ratio: Float
inpainting_size: Integer
det_rotate: Bool
det_auto_rotate: Bool
det_invert: Bool
det_gamma_correct: Bool
min_text_length: Integer
colorization_size: Integer
denoise_sigma: Integer
mask_dilation_offset: Integer
ignore_bubble: Integer
gpt_config: String
filter_text: String
overlay_type: String

// These are api specific args
direction: String // {'auto', 'h', 'v'}
base64Images: String // Image in base64 format
image: Multipart // image upload from multipart
url: String // a URL string
```
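For example, a minimal multipart request against a locally running api server could look like this. This is only a sketch: the endpoint and field names are taken from the lists above, the image file name is a placeholder, and the exact response format may differ between versions.

```bash
# sketch: translate a single image by uploading it as multipart form data
curl http://127.0.0.1:5003/translate \
  -F image=@my_page.png \
  -F translator=google \
  -F target_language=ENG
```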
-To build the docker image locally you can run (You will require make on your machine) +Manual translation replaces machine translation with human translators. +Basic manual translation demo can be found at when using web mode. +
+API +
-```bash -make build-image +Two modes of translation service are provided by the demo: synchronous mode and asynchronous mode.\ +In synchronous mode your HTTP POST request will finish once the translation task is finished.\ +In asynchronous mode your HTTP POST request will respond with a `task_id` immediately, you can use this `task_id` to +poll for translation task state. + +#### Synchronous mode + +1. POST a form request with form data `file:` to +2. Wait for response +3. Use the resultant `task_id` to find translation result in `result/` directory, e.g. using Nginx to expose `result/` + +#### Asynchronous mode + +1. POST a form request with form data `file:` to +2. Acquire translation `task_id` +3. Poll for translation task state by posting JSON `{"taskid": }` to +4. Translation is finished when the resultant state is either `finished`, `error` or `error-lang` +5. Find translation result in `result/` directory, e.g. using Nginx to expose `result/` + +#### Manual translation + +POST a form request with form data `file:` to +and wait for response. + +You will obtain a JSON response like this: + +```json +{ + "task_id": "12c779c9431f954971cae720eb104499", + "status": "pending", + "trans_result": [ + { + "s": "☆上司来ちゃった……", + "t": "" + } + ] +} ``` -Then to test the built image run +Fill in translated texts: -```bash -make run-web-server +```json +{ + "task_id": "12c779c9431f954971cae720eb104499", + "status": "pending", + "trans_result": [ + { + "s": "☆上司来ちゃった……", + "t": "☆Boss is here..." + } + ] +} ``` +Post translated JSON to and wait for response.\ +Then you can find the translation result in `result/` directory, e.g. using Nginx to expose `result/`. + +
## Next steps

A list of what needs to be done next; you're welcome to contribute.

1. Use diffusion model based inpainting to achieve near perfect result, but this could be much slower.
2. ~~**IMPORTANT!!!HELP NEEDED!!!** The current text rendering engine is barely usable, we need your help to improve
   text rendering!~~
3. Text rendering area is determined by detected text lines, not speech bubbles.\
   This works for images without speech bubbles, but makes it impossible to decide where to put translated English
   text. I have no idea how to solve this.
4. [Ryota et al.](https://arxiv.org/abs/2012.14271) proposed using multimodal machine translation, maybe we can add ViT
   features for building custom NMT models.
5. Make this project work for video (rewrite code in C++ and use GPU/other hardware NN accelerator).\
   Used for detecting hard subtitles in videos, generating an ass file and removing them completely.
6. ~~Mask refinement based using non deep learning algorithms, I am currently testing out CRF based algorithm.~~
7. ~~Angled text region merge is not currently supported~~
8. Create pip repository

## Support Us

GPU server is not cheap, please consider donating to us.
-| Original | Translated |
-| -------- | ---------- |
-| 佐藤さんは知っていた - 猫麦 (source: @09ra_19ra) | translated output, plus generated mask |
-| Gris finds out she's of royal blood - VERTI (source: @VERTIGRIS_ART) | translated output with `--detector ctd`, plus generated mask |
-| 陰キャお嬢様の新学期🏫📔🌸 (#3) - ひづき夜宵🎀💜 (source: @hiduki_yayoi) | inpainting-only output with `--translator none`, plus generated mask |
-| 幼なじみの高校デビューの癖がすごい (#1) - 神吉李花☪️🐧 (source: @rikak) | translated output, plus generated mask |
+- Ko-fi: +- Patreon: +- 爱发电: + + ### Thanks To All Our Contributors : + + + + diff --git a/manga_translator/mask_refinement/__init__.py b/manga_translator/mask_refinement/__init__.py index 12bb7917..f608328f 100644 --- a/manga_translator/mask_refinement/__init__.py +++ b/manga_translator/mask_refinement/__init__.py @@ -8,7 +8,7 @@ async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', dilation_offset: int = 0, ignore_bubble: int = 0, verbose: bool = False) -> np.ndarray: # Larger sized mask images will probably have crisper and thinner mask segments due to being able to fit the text pixels better - # so we dont want to size them down as much to not loose information + # so we dont want to size them down as much to not lose information scale_factor = max(min((raw_mask.shape[0] - raw_image.shape[0] / 3) / raw_mask.shape[0], 1), 0.5) img_resized = cv2.resize(raw_image, (int(raw_image.shape[1] * scale_factor), int(raw_image.shape[0] * scale_factor)), interpolation = cv2.INTER_LINEAR) diff --git a/manga_translator/mask_refinement/text_mask_utils.py b/manga_translator/mask_refinement/text_mask_utils.py index fc209beb..cc2efb2f 100644 --- a/manga_translator/mask_refinement/text_mask_utils.py +++ b/manga_translator/mask_refinement/text_mask_utils.py @@ -128,27 +128,30 @@ def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilater # print(textlines[tl_idx].pts, cc_pts, '->', overlapping_area, min(area1, area2), '=', overlapping_area / min(area1, area2), '|', polys[tl_idx].distance(cc_poly)) avg = np.argmax(ratio_mat[label]) - # print('overlap:', ratio_mat[label, avg], '<=', keep_threshold) + # print(avg, 'overlap:', ratio_mat[label, avg], '<=', keep_threshold) area2 = polys[avg].area if area1 >= area2: continue if ratio_mat[label, avg] <= keep_threshold: avg = np.argmin(dist_mat[label]) area2 = polys[avg].area - unit = min([textlines[avg].font_size, w1, h1]) + unit = max(min([textlines[avg].font_size, w1, h1]), 10) + # print("unit", unit, textlines[avg].font_size, w1, h1) # if area1 < 0.4 * w1 * h1: # # ccs is probably angled # unit /= 2 # if avg == 0: - # print('no intersect', area1, '>=', area2, dist_mat[label, avg], '>=', 0.5 * unit) + # print('no intersect', area1, '>=', area2, dist_mat[label, avg], '>=', 0.5 * unit) if dist_mat[label, avg] >= 0.5 * unit: + # print(dist_mat[label]) + # print('CONTINUE') continue textline_ccs[avg][y1:y1+h1, x1:x1+w1][labels[y1:y1+h1, x1:x1+w1] == label] = 255 # if avg == 0: - # print(avg) - # cv2.imshow('ccs', image_resize(textline_ccs[avg], height = 800)) - # cv2.waitKey(0) + # print(avg) + # cv2.imshow('ccs', image_resize(textline_ccs[avg], height = 800)) + # cv2.waitKey(0) textline_rects[avg, 0] = min(textline_rects[avg, 0], x1) textline_rects[avg, 1] = min(textline_rects[avg, 1], y1) textline_rects[avg, 2] = max(textline_rects[avg, 2], x1 + w1) @@ -168,8 +171,8 @@ def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilater x1, y1, w1, h1 = textline_rects[i] text_size = min(w1, h1, textlines[i].font_size) x1, y1, w1, h1 = extend_rect(x1, y1, w1, h1, img.shape[1], img.shape[0], int(text_size * 0.1)) - # TODO: Was text_size * 0.3 before. Need to think of better way to determine dilate_size. - dilate_size = max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3) + # TODO: Need to think of better way to determine dilate_size. 
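+        # note: the expression below truncates (text_size + dilation_offset) * 0.3,
+        # rounds it up to an odd number and clamps it to at least 3, so the
+        # elliptical kernel for cv2.getStructuringElement stays symmetric around a center pixel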
+ dilate_size = max((int((text_size + dilation_offset) * 0.3) // 2) * 2 + 1, 3) # print(textlines[i].font_size, min(w1, h1), dilate_size) kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dilate_size, dilate_size)) cc_region = np.ascontiguousarray(cc[y1: y1 + h1, x1: x1 + w1]) diff --git a/manga_translator/rendering/gimp_render.py b/manga_translator/rendering/gimp_render.py index 9425b88e..bc61135b 100644 --- a/manga_translator/rendering/gimp_render.py +++ b/manga_translator/rendering/gimp_render.py @@ -80,6 +80,10 @@ def gimp_render(out_file, ctx: Context): else: ctx.text_regions = [] + filtered_text_regions = [ + text_region for text_region in ctx.text_regions if text_region.translation != "" + ] + text_init = "\n".join( [ text_init_template.format( @@ -90,7 +94,7 @@ def gimp_render(out_file, ctx: Context): + (" Bold" if text_region.bold else "") + (" Italic" if text_region.italic else ""), ) - for n, text_region in enumerate(ctx.text_regions) + for n, text_region in enumerate(filtered_text_regions) ] ) @@ -115,7 +119,7 @@ def gimp_render(out_file, ctx: Context): letter_spacing=text_region.letter_spacing, base_direction=direction_to_base_direction[text_region.direction], ) - for n, text_region in enumerate(ctx.text_regions) + for n, text_region in enumerate(filtered_text_regions) ] ) @@ -149,6 +153,11 @@ def gimp_console_executable(): if platform.system() == "Windows": gimp_dir = os.getenv("LOCALAPPDATA") + "\\Programs\\GIMP 2\\bin\\" executables = glob.glob(gimp_dir + "gimp-console-2.*.exe") + if len(executables) > 0: + return executables[0] + # may be in program files + gimp_dir = os.getenv("ProgramFiles") + "\\GIMP 2\\bin\\" + executables = glob.glob(gimp_dir + "gimp-console-2.*.exe") if len(executables) == 0: print("error: gimp not found in directory:", gimp_dir) return diff --git a/manga_translator/utils/textblock.py b/manga_translator/utils/textblock.py index c5a0e0fc..cc5f2e41 100644 --- a/manga_translator/utils/textblock.py +++ b/manga_translator/utils/textblock.py @@ -85,7 +85,7 @@ def __init__(self, lines: List, if first_cjk or second_cjk : self.text += txt else : - self.texts += ' ' + txt + self.text += ' ' + txt self.prob = prob self.translation = translation