forked from facebookresearch/LASER
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 4 changed files with 109 additions and 123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,65 +2,34 @@ FROM continuumio/miniconda3 | |
|
||
MAINTAINER Gilles Bodart <[email protected]> | ||
|
||
RUN conda create -n env python=3.6 | ||
RUN echo "source activate env" > ~/.bashrc | ||
ENV PATH /opt/conda/envs/env/bin:$PATH | ||
# Define the argument for language | ||
ARG lang | ||
|
||
RUN apt-get -qq -y update | ||
RUN apt-get -qq -y upgrade | ||
RUN apt-get -qq -y install \ | ||
gcc \ | ||
g++ \ | ||
wget \ | ||
curl \ | ||
git \ | ||
make \ | ||
unzip \ | ||
sudo \ | ||
vim | ||
# Install build-essential (compiler and development tools) | ||
RUN apt-get update && \ | ||
apt-get install -y build-essential && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Use C.UTF-8 locale to avoid issues with ASCII encoding | ||
ENV LC_ALL=C.UTF-8 | ||
ENV LANG=C.UTF-8 | ||
RUN conda create -n env python=3.8 | ||
RUN echo "source activate env" > ~/.bashrc | ||
ENV PATH /opt/conda/envs/env/bin:$PATH | ||
|
||
# Set the working directory to /app | ||
WORKDIR /app | ||
|
||
COPY ./requirements.txt /app/requirements.txt | ||
|
||
# Install any needed packages specified in requirements.txt | ||
RUN pip install --trusted-host pypi.python.org -r requirements.txt --verbose | ||
|
||
|
||
# Download LASER from FB | ||
RUN git clone https://github.com/facebookresearch/LASER.git | ||
|
||
ENV LASER /app/LASER | ||
WORKDIR $LASER | ||
|
||
RUN bash ./install_models.sh | ||
|
||
|
||
#Installing FAISS | ||
|
||
RUN conda install --name env -c pytorch faiss-cpu -y | ||
|
||
RUN bash ./install_external_tools.sh | ||
|
||
COPY ./decode.py $LASER/tasks/embed/decode.py | ||
|
||
|
||
# Make port 80 available to the world outside this container | ||
WORKDIR /app | ||
# Copy the local laser-encoders repository | ||
COPY laser_encoders /app/laser_encoders | ||
COPY pyproject.toml /app/ | ||
|
||
RUN echo "Hello World" > test.txt | ||
RUN pip install --upgrade pip | ||
RUN pip install . --verbose | ||
|
||
RUN $LASER/tasks/embed/embed.sh test.txt en test_embed.raw | ||
RUN python $LASER/tasks/embed/decode.py test_embed.raw | ||
# Download language models based on the specified language | ||
RUN python -m laser_encoders.download_models --lang=$lang | ||
|
||
#Open the port 80 | ||
# Open the port 80 | ||
EXPOSE 80 | ||
|
||
COPY ./app.py /app/app.py | ||
COPY docker/app.py /app/app.py | ||
|
||
CMD ["/bin/bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,44 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from flask import Flask, request, jsonify | ||
import os | ||
import socket | ||
import tempfile | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
from LASER.source.lib.text_processing import Token, BPEfastApply | ||
from LASER.source.embed import * | ||
from flask import Flask, jsonify, request | ||
from laser_encoders import LaserEncoderPipeline | ||
|
||
app = Flask(__name__) | ||
app.config['JSON_AS_ASCII'] = False | ||
|
||
|
||
@app.route("/") | ||
def root(): | ||
print("/") | ||
html = "<h3>Hello {name}!</h3>" \ | ||
"<b>Hostname:</b> {hostname}<br/>" | ||
html = "<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>" | ||
return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname()) | ||
|
||
|
||
@app.route("/vectorize") | ||
@app.route("/vectorize", methods=["GET"]) | ||
def vectorize(): | ||
content = request.args.get('q') | ||
lang = request.args.get('lang') | ||
embedding = '' | ||
if lang is None or not lang: | ||
lang = "en" | ||
# encoder | ||
model_dir = Path(__file__).parent / "LASER" / "models" | ||
encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt" | ||
bpe_codes_path = model_dir / "93langs.fcodes" | ||
print(f' - Encoder: loading {encoder_path}') | ||
encoder = SentenceEncoder(encoder_path, | ||
max_sentences=None, | ||
max_tokens=12000, | ||
sort_kind='mergesort', | ||
cpu=True) | ||
with tempfile.TemporaryDirectory() as tmp: | ||
tmpdir = Path(tmp) | ||
ifname = tmpdir / "content.txt" | ||
bpe_fname = tmpdir / 'bpe' | ||
bpe_oname = tmpdir / 'out.raw' | ||
with ifname.open("w") as f: | ||
f.write(content) | ||
if lang != '--': | ||
tok_fname = tmpdir / "tok" | ||
Token(str(ifname), | ||
str(tok_fname), | ||
lang=lang, | ||
romanize=True if lang == 'el' else False, | ||
lower_case=True, | ||
gzip=False, | ||
verbose=True, | ||
over_write=False) | ||
ifname = tok_fname | ||
BPEfastApply(str(ifname), | ||
str(bpe_fname), | ||
str(bpe_codes_path), | ||
verbose=True, over_write=False) | ||
ifname = bpe_fname | ||
EncodeFile(encoder, | ||
str(ifname), | ||
str(bpe_oname), | ||
verbose=True, | ||
over_write=False, | ||
buffer_size=10000) | ||
dim = 1024 | ||
X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1) | ||
X.resize(X.shape[0] // dim, dim) | ||
embedding = X | ||
body = {'content': content, 'embedding': embedding.tolist()} | ||
return jsonify(body) | ||
content = request.args.get("q") | ||
lang = request.args.get("lang", "en") # Default to English if 'lang' is not provided | ||
|
||
if content is None: | ||
return jsonify({"error": "Missing input content"}), 400 | ||
|
||
try: | ||
encoder = LaserEncoderPipeline(lang=lang) | ||
embeddings = encoder.encode_sentences([content]) | ||
embeddings_list = embeddings.tolist() | ||
body = {"content": content, "embedding": embeddings_list} | ||
return jsonify(body), 200 | ||
|
||
except ValueError as e: | ||
# Check if the exception is due to an unsupported language | ||
if "unsupported language" in str(e).lower(): | ||
return jsonify({"error": f"Language '{lang}' is not supported."}), 400 | ||
else: | ||
return jsonify({"error": str(e)}), 400 | ||
|
||
|
||
if __name__ == "__main__": | ||
app.run(debug=True, port=80, host='0.0.0.0') | ||
app.run(debug=True, port=80, host="0.0.0.0") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,14 @@ | ||
Flask | ||
scipy | ||
numpy | ||
Cython | ||
fairseq==0.12.2 | ||
numpy==1.25.0 | ||
pytest==7.4.0 | ||
Requests==2.31.0 | ||
sacremoses==0.0.53 | ||
sentencepiece==0.1.99 | ||
tqdm==4.65.0 | ||
Flask==2.3.3 | ||
|
||
--extra-index-url https://download.pytorch.org/whl/cpu | ||
torch | ||
transliterate | ||
|
||
--extra-index-url https://test.pypi.org/simple/ | ||
laser-encoders==0.0.3 |