Merge branch 'docker-laser'

Paulooh007 · Nov 21, 2023 · 43d8ae6 · 43d8ae6
2 parents a7905b9 + 7db9310
commit 43d8ae6
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 123 deletions.
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -2,65 +2,34 @@ FROM continuumio/miniconda3
 
 MAINTAINER Gilles Bodart <[email protected]>
 
-RUN conda create -n env python=3.6
-RUN echo "source activate env" > ~/.bashrc
-ENV PATH /opt/conda/envs/env/bin:$PATH
+# Define the argument for language
+ARG lang
 
-RUN apt-get -qq -y update
-RUN apt-get -qq -y upgrade
-RUN apt-get -qq -y install \
-        gcc \
-        g++ \
-        wget \
-        curl \
-        git \
-        make \
-        unzip \
-        sudo \
-        vim
+# Install build-essential (compiler and development tools)
+RUN apt-get update && \
+    apt-get install -y build-essential && \
+    rm -rf /var/lib/apt/lists/*
 
-# Use C.UTF-8 locale to avoid issues with ASCII encoding
-ENV LC_ALL=C.UTF-8
-ENV LANG=C.UTF-8
+RUN conda create -n env python=3.8
+RUN echo "source activate env" > ~/.bashrc
+ENV PATH /opt/conda/envs/env/bin:$PATH
 
 # Set the working directory to /app
 WORKDIR /app
 
-COPY ./requirements.txt /app/requirements.txt
-
-# Install any needed packages specified in requirements.txt
-RUN pip install --trusted-host pypi.python.org -r requirements.txt --verbose
-
-
-# Download LASER from FB
-RUN git clone https://github.com/facebookresearch/LASER.git
-
-ENV LASER /app/LASER
-WORKDIR $LASER
-
-RUN bash ./install_models.sh
-
-
-#Installing FAISS
-
-RUN conda install --name env -c pytorch faiss-cpu -y
-
-RUN bash ./install_external_tools.sh
-
-COPY ./decode.py $LASER/tasks/embed/decode.py
-
-
-# Make port 80 available to the world outside this container
-WORKDIR /app
+# Copy the local laser-encoders repository
+COPY laser_encoders /app/laser_encoders
+COPY pyproject.toml /app/
 
-RUN echo "Hello World" > test.txt
+RUN pip install --upgrade pip
+RUN pip install . --verbose
 
-RUN $LASER/tasks/embed/embed.sh test.txt en test_embed.raw
-RUN python $LASER/tasks/embed/decode.py test_embed.raw
+# Download language models based on the specified language
+RUN python -m laser_encoders.download_models --lang=$lang
 
-#Open the port 80
+# Open the port 80
 EXPOSE 80
 
-COPY ./app.py /app/app.py
+COPY docker/app.py /app/app.py
 
 CMD ["/bin/bash"]
diff --git a/docker/README.md b/docker/README.md
@@ -1,19 +1,62 @@
-## Docker
+## LASER Docker Image
 
-An image docker has been created to help you with the settings of an environment here are the step to follow :
+This image provides a convenient way to run LASER in a Docker container.
+To build the image, run the following command from the root of the LASER directory:
 
-* Open a command prompt on the root of your LASER project
-* Execute the command `docker build --tag=laser docker`
-* Once the image is built run `docker run -it laser`
+```
+docker build --tag=laser -f docker/Dockerfile .
+```
+Once the image is built, you can run it with the following command:
 
-A REST server on top of the embed task is under developement,
-to run it you'll have to expose a local port [CHANGEME_LOCAL_PORT] by executing the next line instead of the last command. It'll overinde the command line entrypoint of your docker container.
+```
+docker run -it laser
+```
+**Note:** If you want to expose a local port to the REST server on top of the embed task, you can do so by executing the following command instead of the last command:
 
-* `docker run -p [CHANGEME_LOCAL_PORT]:80 -it laser python app.py`
+```
+docker run -it -p [CHANGEME_LOCAL_PORT]:80 laser python app.py
+```
+This overrides the container's default command (`/bin/bash`, set by `CMD` in the Dockerfile).
+
+Example:
+
+```
+docker run -it -p 8081:80 laser python app.py
+```
 
 This Flask server will serve a REST Api that can be use by calling your server with this URL :
 
-*   http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE]
+```
+http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE]
+```
+
+Example:
+
+```
+http://127.0.0.1:8081/vectorize?q=ki%20lo%20'orukọ%20ẹ&lang=yor
+```
+
+Sample response:
+```
+{
+    "content": "ki lo 'orukọ ẹ",
+    "embedding": [
+        [
+            -0.10241681337356567,
+            0.11120740324258804,
+            -0.26641348004341125,
+            -0.055699944496154785,
+            ....
+            ....
+            ....
+            -0.034048307687044144,
+            0.11005636304616928,
+            -0.3238321840763092,
+            -0.060631975531578064,
+            -0.19269055128097534
+        ]
+    ]
+}
+```
 
 Here is an example of how you can send requests to it with python:
 

diff --git a/docker/app.py b/docker/app.py
@@ -1,78 +1,44 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from flask import Flask, request, jsonify
 import os
 import socket
-import tempfile
-from pathlib import Path
+
 import numpy as np
-from LASER.source.lib.text_processing import Token, BPEfastApply
-from LASER.source.embed import *
+from flask import Flask, jsonify, request
+from laser_encoders import LaserEncoderPipeline
 
 app = Flask(__name__)
-app.config['JSON_AS_ASCII'] = False
 
 
 @app.route("/")
 def root():
     print("/")
-    html = "<h3>Hello {name}!</h3>" \
-           "<b>Hostname:</b> {hostname}<br/>"
+    html = "<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>"
     return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname())
 
 
-@app.route("/vectorize")
+@app.route("/vectorize", methods=["GET"])
 def vectorize():
-    content = request.args.get('q')
-    lang = request.args.get('lang')
-    embedding = ''
-    if lang is None or not lang:
-        lang = "en"
-    # encoder
-    model_dir = Path(__file__).parent / "LASER" / "models"
-    encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt"
-    bpe_codes_path = model_dir / "93langs.fcodes"
-    print(f' - Encoder: loading {encoder_path}')
-    encoder = SentenceEncoder(encoder_path,
-                              max_sentences=None,
-                              max_tokens=12000,
-                              sort_kind='mergesort',
-                              cpu=True)
-    with tempfile.TemporaryDirectory() as tmp:
-        tmpdir = Path(tmp)
-        ifname = tmpdir / "content.txt"
-        bpe_fname = tmpdir / 'bpe'
-        bpe_oname = tmpdir / 'out.raw'
-        with ifname.open("w") as f:
-            f.write(content)
-        if lang != '--':
-            tok_fname = tmpdir / "tok"
-            Token(str(ifname),
-                  str(tok_fname),
-                  lang=lang,
-                  romanize=True if lang == 'el' else False,
-                  lower_case=True,
-                  gzip=False,
-                  verbose=True,
-                  over_write=False)
-            ifname = tok_fname
-        BPEfastApply(str(ifname),
-                     str(bpe_fname),
-                     str(bpe_codes_path),
-                     verbose=True, over_write=False)
-        ifname = bpe_fname
-        EncodeFile(encoder,
-                   str(ifname),
-                   str(bpe_oname),
-                   verbose=True,
-                   over_write=False,
-                   buffer_size=10000)
-        dim = 1024
-        X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1)
-        X.resize(X.shape[0] // dim, dim)
-        embedding = X
-    body = {'content': content, 'embedding': embedding.tolist()}
-    return jsonify(body)
+    content = request.args.get("q")
+    lang = request.args.get("lang", "en")  # Default to English if 'lang' is not provided
+
+    if content is None:
+        return jsonify({"error": "Missing input content"}), 400
+
+    try:
+        encoder = LaserEncoderPipeline(lang=lang)
+        embeddings = encoder.encode_sentences([content])
+        embeddings_list = embeddings.tolist()
+        body = {"content": content, "embedding": embeddings_list}
+        return jsonify(body), 200
+
+    except ValueError as e:
+        # Check if the exception is due to an unsupported language
+        if "unsupported language" in str(e).lower():
+            return jsonify({"error": f"Language '{lang}' is not supported."}), 400
+        else:
+            return jsonify({"error": str(e)}), 400
+
 
 if __name__ == "__main__":
-    app.run(debug=True, port=80, host='0.0.0.0')
+    app.run(debug=True, port=80, host="0.0.0.0")
diff --git a/docker/requirements.txt b/docker/requirements.txt
@@ -1,6 +1,14 @@
-Flask
-scipy
-numpy
-Cython
+fairseq==0.12.2
+numpy==1.25.0
+pytest==7.4.0
+Requests==2.31.0
+sacremoses==0.0.53
+sentencepiece==0.1.99
+tqdm==4.65.0
+Flask==2.3.3
+
+--extra-index-url https://download.pytorch.org/whl/cpu
 torch
-transliterate
+
+--extra-index-url https://test.pypi.org/simple/
+laser-encoders==0.0.3