From f51fb0a51efe8218bdc5f4fda7c6fc4e462e0c63 Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 28 Nov 2023 13:23:11 +0100 Subject: [PATCH] Improve caching for laser2 languages --- docker/app.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/docker/app.py b/docker/app.py index b9d79a08..1ae7e938 100644 --- a/docker/app.py +++ b/docker/app.py @@ -4,32 +4,52 @@ import socket from flask import Flask, jsonify, request + from laser_encoders import LaserEncoderPipeline +from laser_encoders.language_list import LASER2_LANGUAGES_LIST app = Flask(__name__) # Global cache for encoders encoder_cache = {} + @app.route("/") def root(): print("/") html = "

Hello {name}!

" "Hostname: {hostname}
" return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname()) + @app.route("/vectorize", methods=["GET"]) def vectorize(): content = request.args.get("q") - lang = request.args.get("lang", "eng") # Default to English if 'lang' is not provided + lang = request.args.get( + "lang", "eng" + ) # Default to English if 'lang' is not provided if content is None: return jsonify({"error": "Missing input content"}), 400 try: - # Use cached encoder if available, else create a new one - if lang not in encoder_cache: + lang_prefix = lang[:3] + + # Find if lang uses laser2 encoder and is already cached + cached_laser2_language = next( + ( + l + for l in LASER2_LANGUAGES_LIST + if l.startswith(lang_prefix) and l in encoder_cache + ), + None, + ) + + if cached_laser2_language: + encoder = encoder_cache[cached_laser2_language] + else: + # Use the provided language code as is for the new encoder encoder_cache[lang] = LaserEncoderPipeline(lang=lang) - encoder = encoder_cache[lang] + encoder = encoder_cache[lang] embeddings = encoder.encode_sentences([content]) embeddings_list = embeddings.tolist() @@ -43,5 +63,6 @@ def vectorize(): else: return jsonify({"error": str(e)}), 400 + if __name__ == "__main__": app.run(debug=True, port=80, host="0.0.0.0")