From 8be7c91f295a996fe2d6362c7edbe7b1156490d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pedro=20Guimar=C3=A3es?= Date: Wed, 8 Nov 2023 14:38:23 +0000 Subject: [PATCH] =?UTF-8?q?Substitui=C3=A7=C3=A3o=20do=20elasticsearch=20p?= =?UTF-8?q?elo=20opensearch.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alterações feitas para refletir todas as alterações de ES para OS. Falta apenas o Makefile. Makefile adaptado para Opensearch Correções diversas. --- .gitignore | 2 + Makefile | 49 ++++--- api/api.py | 4 +- config/config.py | 7 +- config/sample.env | 34 +++-- docs/CONTRIBUTING.md | 6 +- gazettes/gazette_access.py | 3 +- index/__init__.py | 2 +- index/{elasticsearch.py => opensearch.py} | 24 +-- main/__main__.py | 2 +- requirements.txt | 4 +- scripts/load_fake_gazettes.py | 36 ++--- tests/test_config.py | 28 ++-- ...st_elasticsearch.py => test_opensearch.py} | 138 +++++++++--------- themed_excerpts/themed_excerpt_access.py | 2 +- 15 files changed, 173 insertions(+), 168 deletions(-) rename index/{elasticsearch.py => opensearch.py} (89%) rename tests/{test_elasticsearch.py => test_opensearch.py} (93%) diff --git a/.gitignore b/.gitignore index 40a1660..1314726 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .coverage __pycache__ config/current.env +censo.csv +themes_config.json diff --git a/Makefile b/Makefile index 159fb05..4fb9404 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,17 @@ -IMAGE_NAMESPACE ?= serenata +IMAGE_NAMESPACE ?= okfn-brasil IMAGE_NAME ?= querido-diario-api IMAGE_TAG ?= latest IMAGE_FORMAT ?= docker -# Elasticsearch ports -# Variables used to connect the app to the ElasticSearch +# Opensearch ports +# Variables used to connect the app to the OpenSearch QUERIDO_DIARIO_DATABASE_CSV ?= censo.csv -ELASTICSEARCH_PORT1 ?= 9200 -ELASTICSEARCH_PORT2 ?= 9300 +OPENSEARCH_PORT1 ?= 9200 +OPENSEARCH_PORT2 ?= 9300 # Containers data POD_NAME ?= querido-diario-api DATABASE_CONTAINER_NAME ?= $(POD_NAME)-db 
-ELASTICSEARCH_CONTAINER_NAME ?= $(POD_NAME)-elasticsearch +OPENSEARCH_CONTAINER_NAME ?= $(POD_NAME)-opensearch # Database info user to run the tests POSTGRES_USER ?= companies POSTGRES_PASSWORD ?= companies @@ -20,7 +20,7 @@ POSTGRES_HOST ?= localhost POSTGRES_PORT ?= 5432 POSTGRES_IMAGE ?= docker.io/postgres:10 DATABASE_RESTORE_FILE ?= contrib/data/queridodiariodb.tar -# Run integration tests. Run local elasticsearch to validate the iteration +# Run integration tests. Run local opensearch to validate the iteration RUN_INTEGRATION_TESTS ?= 0 API_PORT := 8080 @@ -72,20 +72,20 @@ destroy-pod: podman pod rm --force --ignore $(POD_NAME) create-pod: destroy-pod - cp --no-clobber config/sample.env config/current.env + -cp --no-clobber config/sample.env config/current.env podman pod create --publish $(API_PORT):$(API_PORT) \ - --publish $(ELASTICSEARCH_PORT1):$(ELASTICSEARCH_PORT1) \ - --publish $(ELASTICSEARCH_PORT2):$(ELASTICSEARCH_PORT2) \ --publish $(POSTGRES_PORT):$(POSTGRES_PORT) \ + --publish $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \ + --publish $(OPENSEARCH_PORT2):$(OPENSEARCH_PORT2) \ --name $(POD_NAME) set-test-variables: $(eval POD_NAME=test-$(POD_NAME)) $(eval DATABASE_CONTAINER_NAME=test-$(DATABASE_CONTAINER_NAME)) $(eval API_PORT=8088) - $(eval ELASTICSEARCH_PORT1=9201) - $(eval ELASTICSEARCH_PORT2=9301) - $(eval ELASTICSEARCH_CONTAINER_NAME=test-$(ELASTICSEARCH_CONTAINER_NAME)) + $(eval OPENSEARCH_PORT1=9201) + $(eval OPENSEARCH_PORT2=9301) + $(eval OPENSEARCH_CONTAINER_NAME=test-$(OPENSEARCH_CONTAINER_NAME)) $(eval QUERIDO_DIARIO_DATABASE_CSV="") set-integration-test-variables: set-test-variables @@ -99,14 +99,14 @@ retest: set-test-variables black $(call run-command, python -m unittest discover tests) .PHONY: test-all -test-all: set-integration-test-variables create-pod elasticsearch database retest +test-all: set-integration-test-variables create-pod opensearch database retest .PHONY: test-shell test-shell: set-test-variables $(call run-command, bash) 
.PHONY: coverage -coverage: set-test-variables create-pod elasticsearch database +coverage: set-test-variables create-pod opensearch database $(call run-command, coverage erase) $(call run-command, coverage run -m unittest tests) $(call run-command, coverage report -m) @@ -119,7 +119,7 @@ shell: bash .PHONY: run -run: create-pod elasticsearch database load-data rerun +run: create-pod opensearch database rerun .PHONY:load-data load-data: @@ -127,7 +127,7 @@ load-data: .PHONY: rerun -rerun: wait-elasticsearch wait-database +rerun: wait-opensearch wait-database $(call run-command, python main) .PHONY: runshell @@ -135,19 +135,20 @@ runshell: $(call run-command, bash) -elasticsearch: stop-elasticsearch start-elasticsearch wait-elasticsearch +opensearch: stop-opensearch start-opensearch wait-opensearch -start-elasticsearch: +start-opensearch: podman run -d --rm -ti \ - --name $(ELASTICSEARCH_CONTAINER_NAME) \ + --name $(OPENSEARCH_CONTAINER_NAME) \ --pod $(POD_NAME) \ --env discovery.type=single-node \ - elasticsearch:7.9.1 + --env plugins.security.ssl.http.enabled=false \ + opensearchproject/opensearch:2.9.0 -stop-elasticsearch: - podman rm --force --ignore $(ELASTICSEARCH_CONTAINER_NAME) +stop-opensearch: + podman rm --force --ignore $(OPENSEARCH_CONTAINER_NAME) -wait-elasticsearch: +wait-opensearch: $(call wait-for, localhost:9200) .PHONY: stop-database diff --git a/api/api.py b/api/api.py index 0449ed0..1bad5f8 100644 --- a/api/api.py +++ b/api/api.py @@ -242,7 +242,7 @@ async def get_gazettes( ), querystring: str = Query( "", - description='Search in gazettes using ElasticSearch\'s "simple query string syntax" (an empty field returns no excerpts, only the results metadata).', + description='Search in gazettes using OpenSearch\'s "simple query string syntax" (an empty field returns no excerpts, only the results metadata).', ), excerpt_size: int = Query( 500, @@ -340,7 +340,7 @@ async def get_themed_excerpts( ), querystring: str = Query( "", - description='Search in 
excerpts using ElasticSearch\'s "simple query string syntax".', + description='Search in excerpts using OpenSearch\'s "simple query string syntax".', ), pre_tags: List[str] = Query( [""], diff --git a/config/config.py b/config/config.py index 9edc17d..1f796cd 100644 --- a/config/config.py +++ b/config/config.py @@ -8,7 +8,7 @@ class Configuration: def __init__(self): - self.host = os.environ.get("QUERIDO_DIARIO_ELASTICSEARCH_HOST", "") + self.host = os.environ.get("QUERIDO_DIARIO_OPENSEARCH_HOST", "") self.root_path = os.environ.get("QUERIDO_DIARIO_API_ROOT_PATH", "") self.url_prefix = os.environ.get("QUERIDO_DIARIO_URL_PREFIX", "") self.cors_allow_origins = Configuration._load_list( @@ -45,7 +45,7 @@ def __init__(self): "QUERIDO_DIARIO_SUGGESTION_MAILJET_CUSTOM_ID", "" ) self.city_database_file = os.environ["CITY_DATABASE_CSV"] - self.gazette_index = os.environ.get("GAZETTE_ELASTICSEARCH_INDEX", "") + self.gazette_index = os.environ.get("GAZETTE_OPENSEARCH_INDEX", "") self.gazette_content_field = os.environ.get("GAZETTE_CONTENT_FIELD", "") self.gazette_content_exact_field_suffix = os.environ.get( "GAZETTE_CONTENT_EXACT_FIELD_SUFFIX", "" @@ -96,7 +96,8 @@ def __init__(self): self.companies_database_user = os.environ.get("POSTGRES_USER", "") self.companies_database_pass = os.environ.get("POSTGRES_PASSWORD", "") self.companies_database_port = os.environ.get("POSTGRES_PORT", "") - + self.opensearch_user = os.environ.get("QUERIDO_DIARIO_OPENSEARCH_USER", "") + self.opensearch_pswd = os.environ.get("QUERIDO_DIARIO_OPENSEARCH_PASSWORD", "") @classmethod def _load_list(cls, key, default=[]): value = os.environ.get(key, default) diff --git a/config/sample.env b/config/sample.env index f9e0c8f..8b4a0e2 100644 --- a/config/sample.env +++ b/config/sample.env @@ -1,4 +1,6 @@ -QUERIDO_DIARIO_ELASTICSEARCH_HOST=localhost +QUERIDO_DIARIO_OPENSEARCH_HOST=localhost +QUERIDO_DIARIO_OPENSEARCH_USER=admin +QUERIDO_DIARIO_OPENSEARCH_PASSWORD=admin 
QUERIDO_DIARIO_SUGGESTION_MAILJET_REST_API_KEY=mailjet.com QUERIDO_DIARIO_SUGGESTION_MAILJET_REST_API_SECRET=mailjet.com QUERIDO_DIARIO_SUGGESTION_SENDER_NAME=Sender Name @@ -12,21 +14,21 @@ POSTGRES_DB=companiesdb POSTGRES_HOST=localhost POSTGRES_PORT=5432 CITY_DATABASE_CSV=censo.csv -GAZETTE_ELASTICSEARCH_INDEX=gazettes -GAZETTE_CONTENT_FIELD=content_field -GAZETTE_CONTENT_EXACT_FIELD_SUFFIX=.field_suffix -GAZETTE_PUBLICATION_DATE_FIELD=publication_date_field -GAZETTE_SCRAPED_AT_FIELD=scraped_at_field -GAZETTE_TERRITORY_ID_FIELD=territory_id_field +GAZETTE_OPENSEARCH_INDEX=querido-diario +GAZETTE_CONTENT_FIELD=source_text +GAZETTE_CONTENT_EXACT_FIELD_SUFFIX=.exact +GAZETTE_PUBLICATION_DATE_FIELD=date +GAZETTE_SCRAPED_AT_FIELD=scraped_at +GAZETTE_TERRITORY_ID_FIELD=territory_id THEMES_DATABASE_JSON=themes_config.json -THEMED_EXCERPT_CONTENT_FIELD=content_field -THEMED_EXCERPT_CONTENT_EXACT_FIELD_SUFFIX=.field_suffix -THEMED_EXCERPT_PUBLICATION_DATE_FIELD=publication_date_field -THEMED_EXCERPT_SCRAPED_AT_FIELD=scraped_at_field -THEMED_EXCERPT_TERRITORY_ID_FIELD=territory_id_field -THEMED_EXCERPT_ENTITIES_FIELD=entities_field -THEMED_EXCERPT_SUBTHEMES_FIELD=subthemes_field -THEMED_EXCERPT_EMBEDDING_SCORE_FIELD=embedding_score_field -THEMED_EXCERPT_TFIDF_SCORE_FIELD=tfidf_score_field +THEMED_EXCERPT_CONTENT_FIELD=excerpt +THEMED_EXCERPT_CONTENT_EXACT_FIELD_SUFFIX=.exact +THEMED_EXCERPT_PUBLICATION_DATE_FIELD=source_date +THEMED_EXCERPT_SCRAPED_AT_FIELD=source_scraped_at +THEMED_EXCERPT_TERRITORY_ID_FIELD=source_territory_id +THEMED_EXCERPT_ENTITIES_FIELD=excerpt_entities +THEMED_EXCERPT_SUBTHEMES_FIELD=excerpt_subthemes +THEMED_EXCERPT_EMBEDDING_SCORE_FIELD=excerpt_embedding_score +THEMED_EXCERPT_TFIDF_SCORE_FIELD=excerpt_tfidf_score THEMED_EXCERPT_FRAGMENT_SIZE=10000 THEMED_EXCERPT_NUMBER_OF_FRAGMENTS=1 diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index f3f8aee..938a99b 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -24,10 +24,10 @@ Já 
leu? Então vamos às informações específicas deste repositório: | Serviço | [`themed_excerpts`](/themed_excerpts) | Consultas ao índices de busca textual temáticos do QD. | index | | Módulo | [`database`](/database) | Classe de interação com bancos de dados Postgres. | Postgres | | Módulo | [`config`](/config) | Configuração de variáveis de ambiente. | | -| Módulo | [`index`](/index) | Classe de interação com índices Elasticsearch. | Elasticsearch | +| Módulo | [`index`](/index) | Classe de interação com índices Opensearch. | Opensearch | | Recurso | Postgres | Banco de dados de CNPJ. Contém informações sobre empresas e sócios cadastrados na Receita Federal. | | | Recurso | Banco de dados do [Censo](https://censo.ok.org.br) | Banco de dados de municípios. Contém metadados municipais. | | -| Recurso | Elasticsearch | Índices de busca textual. | | +| Recurso | Opensearch | Índices de busca textual. | | | Recurso | Mailjet | Serviço de envio de email. | | ## Como configurar o ambiente de desenvolvimento @@ -69,4 +69,4 @@ make coverage ``` # Mantendo -As pessoas mantenedoras devem seguir as diretrizes do [Guia para Mantenedoras](https://github.com/okfn-brasil/querido-diario-comunidade/blob/main/.github/CONTRIBUTING.md#mantendo) do Querido Diário. \ No newline at end of file +As pessoas mantenedoras devem seguir as diretrizes do [Guia para Mantenedoras](https://github.com/okfn-brasil/querido-diario-comunidade/blob/main/.github/CONTRIBUTING.md#mantendo) do Querido Diário. 
diff --git a/gazettes/gazette_access.py b/gazettes/gazette_access.py index 80f5689..e03ebb6 100644 --- a/gazettes/gazette_access.py +++ b/gazettes/gazette_access.py @@ -3,7 +3,7 @@ from typing import Dict, List, Union from index import SearchEngineInterface -from index.elasticsearch import ( +from index.opensearch import ( QueryBuilderInterface, DateRangeQueryMixin, SimpleStringQueryMixin, @@ -395,7 +395,6 @@ def create_gazettes_data_gateway( raise Exception( "Query builder should implement the QueryBuilderInterface interface" ) - return GazetteSearchEngineGateway(search_engine, query_builder, index) diff --git a/index/__init__.py b/index/__init__.py index c620d24..5db1254 100644 --- a/index/__init__.py +++ b/index/__init__.py @@ -1 +1 @@ -from .elasticsearch import create_search_engine_interface, SearchEngineInterface +from .opensearch import create_search_engine_interface, SearchEngineInterface diff --git a/index/elasticsearch.py b/index/opensearch.py similarity index 89% rename from index/elasticsearch.py rename to index/opensearch.py index 362504e..5ff01c4 100644 --- a/index/elasticsearch.py +++ b/index/opensearch.py @@ -1,10 +1,11 @@ import abc +import os import re from datetime import date from enum import Enum, unique -from typing import Dict, List, Union +from typing import Dict, List, Tuple, Union -import elasticsearch +import opensearchpy class SearchEngineInterface(abc.ABC): @@ -15,7 +16,7 @@ class SearchEngineInterface(abc.ABC): @abc.abstractmethod def search(self, query: Dict, index: str = "", timeout: int = 30) -> Dict: """ - Searches the index with the provided elasticsearch_dsl.Search + Searches the index with the provided opensearch_dsl.Search """ @abc.abstractmethod @@ -25,20 +26,20 @@ def index_exists(self, index: str) -> bool: """ -class ElasticSearch(SearchEngineInterface): - def __init__(self, host: str, default_index: str = ""): - self._es = elasticsearch.Elasticsearch(hosts=[host]) +class OpenSearch(SearchEngineInterface): + def 
__init__(self, host: str, credentials: Tuple[str, str]=("user", "pswd"), default_index: str = ""): + self._search_engine = opensearchpy.OpenSearch(hosts=[host], http_auth=credentials) self._default_index = default_index def search(self, query: Dict, index: str = "", timeout: int = 30) -> Dict: index_name = self._get_index_name(index) - response = self._es.search( + response = self._search_engine.search( index=index_name, body=query, request_timeout=timeout ) return response def index_exists(self, index: str) -> bool: - return self._es.indices.exists(index=index) + return self._search_engine.indices.exists(index=index) def _get_index_name(self, index: str) -> str: index_name = index if self._is_valid_index_name(index) else self._default_index @@ -48,7 +49,7 @@ def _get_index_name(self, index: str) -> str: def _is_valid_index_name(self, index: str) -> bool: return isinstance(index, str) and len(index) > 0 - + class QueryBuilderInterface(abc.ABC): @abc.abstractmethod @@ -213,10 +214,11 @@ def build_field_highlight( def create_search_engine_interface( - host: str = "", default_index: str = "" + host: str = "", credentials: Tuple[str, str]=("user", "pswd"), default_index: str = "" ) -> SearchEngineInterface: if not isinstance(host, str) or len(host.strip()) == 0: raise Exception("Missing host") if not isinstance(default_index, str): raise Exception("Invalid index name") - return ElasticSearch(host.strip(), default_index=default_index.strip()) + return OpenSearch(host.strip(), credentials=credentials, default_index=default_index.strip()) + diff --git a/main/__main__.py b/main/__main__.py index 7865f92..f1dcd92 100644 --- a/main/__main__.py +++ b/main/__main__.py @@ -22,7 +22,7 @@ configuration = load_configuration() search_engine = create_search_engine_interface( - configuration.host, configuration.gazette_index + configuration.host, (configuration.opensearch_user, configuration.opensearch_pswd), configuration.gazette_index ) gazettes_query_builder = 
create_gazettes_query_builder( diff --git a/requirements.txt b/requirements.txt index 8a00c13..d2fe7e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,9 +2,9 @@ black==19.10b0 coverage==5.2.1 dateparser==0.7.6 fastapi==0.61.0 -requests==2.24.0 +requests==2.30.0 uvicorn==0.11.8 psycopg2==2.8.5 SQLAlchemy==1.3.19 -elasticsearch==7.9.1 +opensearch-py==2.3.2 mailjet-rest==1.3.4 diff --git a/scripts/load_fake_gazettes.py b/scripts/load_fake_gazettes.py index 2769d27..a067687 100644 --- a/scripts/load_fake_gazettes.py +++ b/scripts/load_fake_gazettes.py @@ -1,7 +1,7 @@ from datetime import date, timedelta import time -import elasticsearch +import opensearchpy TERRITORY_ID1 = "3304557" TERRITORY_ID2 = "4205902" @@ -10,11 +10,11 @@ INDEX = "gazettes" -def delete_index(es): +def delete_index(search_engine): for attempt in range(3): try: - es.indices.delete(index=INDEX, ignore_unavailable=True, timeout="30s") - es.indices.refresh() + search_engine.indices.delete(index=INDEX, ignore_unavailable=True, timeout="30s") + search_engine.indices.refresh() print("Index deleted") return except Exception as e: @@ -22,15 +22,15 @@ def delete_index(es): time.sleep(10) -def create_index(es): +def create_index(search_engine): for attempt in range(3): try: - es.indices.create( + search_engine.indices.create( index=INDEX, body={"mappings": {"properties": {"date": {"type": "date"}}}}, - timeout="30s", + timeout=30, ) - es.indices.refresh() + search_engine.indices.refresh() print(f"Index {INDEX} created") return except Exception as e: @@ -38,26 +38,26 @@ def create_index(es): time.sleep(10) -def recreate_index(es): - # delete_index(es) - create_index(es) +def recreate_index(search_engine): + delete_index(search_engine) + create_index(search_engine) -def try_push_data_to_index(es, bulk_data): +def try_push_data_to_index(search_engine, bulk_data): for attempt in range(3): try: - es.bulk(bulk_data, index=INDEX, refresh=True, timeout="30s") + search_engine.bulk(bulk_data, index=INDEX, 
refresh=True, timeout="30s") return except Exception as e: time.sleep(10) -def add_data_on_index(data, es): +def add_data_on_index(data, search_engine): bulk_data = [] for gazette in data: bulk_data.append({"index": {"_index": INDEX, "_id": gazette["file_checksum"]}}) bulk_data.append(gazette) - try_push_data_to_index(es, bulk_data) + try_push_data_to_index(search_engine, bulk_data) print("Index populated") @@ -300,9 +300,9 @@ def get_data(): def main(): - es = elasticsearch.Elasticsearch(hosts=["localhost"]) - recreate_index(es) - add_data_on_index(get_data(), es) + search_engine = opensearchpy.OpenSearch(hosts=["localhost"], http_auth=("admin", "admin")) + recreate_index(search_engine) + add_data_on_index(get_data(), search_engine) if __name__ == "__main__": diff --git a/tests/test_config.py b/tests/test_config.py index 2b8c830..a02f605 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -9,13 +9,13 @@ class BasicConfigurationTests(TestCase): def check_configuration_values(self, configuration, expected_values): self.assertEqual( configuration.host, - expected_values["QUERIDO_DIARIO_ELASTICSEARCH_HOST"], - msg="Invalid elasticsearch host", + expected_values["QUERIDO_DIARIO_OPENSEARCH_HOST"], + msg="Invalid opensearch host", ) self.assertEqual( configuration.index, - expected_values["QUERIDO_DIARIO_ELASTICSEARCH_INDEX"], - msg="Invalid elasticsearch index", + expected_values["QUERIDO_DIARIO_OPENSEARCH_INDEX"], + msg="Invalid opensearch index", ) self.assertEqual( configuration.root_path, @@ -49,8 +49,8 @@ def check_configuration_values(self, configuration, expected_values): ) def test_load_configuration_with_no_envvars(self): expected_config_dict = { - "QUERIDO_DIARIO_ELASTICSEARCH_HOST": "", - "QUERIDO_DIARIO_ELASTICSEARCH_INDEX": "", + "QUERIDO_DIARIO_OPENSEARCH_HOST": "", + "QUERIDO_DIARIO_OPENSEARCH_INDEX": "", "QUERIDO_DIARIO_API_ROOT_PATH": "", "QUERIDO_DIARIO_URL_PREFIX": "", "QUERIDO_DIARIO_CORS_ALLOW_ORIGINS": ["*"], @@ -64,8 +64,8 @@ def 
test_load_configuration_with_no_envvars(self): @patch.dict( "os.environ", { - "QUERIDO_DIARIO_ELASTICSEARCH_HOST": "", - "QUERIDO_DIARIO_ELASTICSEARCH_INDEX": "", + "QUERIDO_DIARIO_OPENSEARCH_HOST": "", + "QUERIDO_DIARIO_OPENSEARCH_INDEX": "", "QUERIDO_DIARIO_API_ROOT_PATH": "", "QUERIDO_DIARIO_URL_PREFIX": "", "QUERIDO_DIARIO_CORS_ALLOW_ORIGINS": "", @@ -77,8 +77,8 @@ def test_load_configuration_with_no_envvars(self): ) def test_load_configuration_with_empty_envvars(self): expected_config_dict = { - "QUERIDO_DIARIO_ELASTICSEARCH_HOST": "", - "QUERIDO_DIARIO_ELASTICSEARCH_INDEX": "", + "QUERIDO_DIARIO_OPENSEARCH_HOST": "", + "QUERIDO_DIARIO_OPENSEARCH_INDEX": "", "QUERIDO_DIARIO_API_ROOT_PATH": "", "QUERIDO_DIARIO_URL_PREFIX": "", "QUERIDO_DIARIO_CORS_ALLOW_ORIGINS": [""], @@ -92,8 +92,8 @@ def test_load_configuration_with_empty_envvars(self): @patch.dict( "os.environ", { - "QUERIDO_DIARIO_ELASTICSEARCH_HOST": "000.0.0.0", - "QUERIDO_DIARIO_ELASTICSEARCH_INDEX": "myindex", + "QUERIDO_DIARIO_OPENSEARCH_HOST": "000.0.0.0", + "QUERIDO_DIARIO_OPENSEARCH_INDEX": "myindex", "QUERIDO_DIARIO_API_ROOT_PATH": "api/", "QUERIDO_DIARIO_URL_PREFIX": "https://test.com", "QUERIDO_DIARIO_CORS_ALLOW_ORIGINS": "localhost", @@ -105,8 +105,8 @@ def test_load_configuration_with_empty_envvars(self): ) def test_load_configuration_with_envvars_defined(self): expected_config_dict = { - "QUERIDO_DIARIO_ELASTICSEARCH_HOST": "000.0.0.0", - "QUERIDO_DIARIO_ELASTICSEARCH_INDEX": "myindex", + "QUERIDO_DIARIO_OPENSEARCH_HOST": "000.0.0.0", + "QUERIDO_DIARIO_OPENSEARCH_INDEX": "myindex", "QUERIDO_DIARIO_API_ROOT_PATH": "api/", "QUERIDO_DIARIO_URL_PREFIX": "https://test.com", "QUERIDO_DIARIO_CORS_ALLOW_ORIGINS": ["localhost"], diff --git a/tests/test_elasticsearch.py b/tests/test_opensearch.py similarity index 93% rename from tests/test_elasticsearch.py rename to tests/test_opensearch.py index 3406a8e..e7cc646 100644 --- a/tests/test_elasticsearch.py +++ b/tests/test_opensearch.py @@ -6,56 +6,55 @@ 
import uuid import time -import elasticsearch +import opensearchpy -from index import ElasticSearchDataMapper, create_elasticsearch_data_mapper +from index import OpenSearchDataMapper, create_opensearch_data_mapper from gazettes import GazetteDataGateway, Gazette FILE_ENDPOINT = "http://test.com" -class ElasticSearchInterfaceTest(TestCase): - @patch("elasticsearch.Elasticsearch") - def test_create_elasticsearch_mapper(self, es_mock): - mapper = create_elasticsearch_data_mapper("localhost", "gazettes") +class OpenSearchInterfaceTest(TestCase): + @patch("opensearchpy.OpenSearch") + def test_create_opensearch_mapper(self, os_mock): + mapper = create_opensearch_data_mapper("localhost", "gazettes") self.assertIsInstance(mapper, GazetteDataGateway) - @patch("elasticsearch.Elasticsearch") + @patch("opensearchpy.OpenSearch") @unittest.expectedFailure - def test_create_elasticsearch_mapper_should_fail_without_host(self, es_mock): - create_elasticsearch_data_mapper() + def test_create_opensearch_mapper_should_fail_without_host(self, os_mock): + create_opensearch_data_mapper() - @patch("elasticsearch.Elasticsearch") - def test_create_elasticsearch_mapper_without_host(self, es_mock): + @patch("opensearchpy.OpenSearch") + def test_create_opensearch_mapper_without_host(self, os_mock): with self.assertRaisesRegex(Exception, "Missing host") as cm: - mapper = create_elasticsearch_data_mapper("", "gazettes") + mapper = create_opensearch_data_mapper("", "gazettes") - @patch("elasticsearch.Elasticsearch") - def test_create_elasticsearch_mapper_without_index_name(self, es_mock): + @patch("opensearchpy.OpenSearch") + def test_create_opensearch_mapper_without_index_name(self, os_mock): with self.assertRaisesRegex(Exception, "Missing index name") as cm: - mapper = create_elasticsearch_data_mapper("localhost") + mapper = create_opensearch_data_mapper("localhost") - def configure_es_mock_to_return_itself_in_the_es_constructor( - self, es_mock, indices_mock + def 
configure_search_engine_mock_to_return_itself_in_the_search_engine_constructor(
+        self, os_mock, indices_mock
     ):
-        es_mock.indices = indices_mock
-        es_mock.return_value = es_mock
+        os_mock.indices = indices_mock
+        os_mock.return_value = os_mock
 
-    @patch("elasticsearch.Elasticsearch")
-    @patch("elasticsearch.client.IndicesClient")
-    def test_create_elasticsearch_mapper_using_non_existing_index_should_fail(
-        self, indices_mock, es_mock
+    @patch("opensearchpy.OpenSearch")
+    @patch("opensearchpy.client.IndicesClient")
+    def test_create_opensearch_mapper_using_non_existing_index_should_fail(self, indices_mock, os_mock
     ):
         indices_mock.exists.return_value = False
-        self.configure_es_mock_to_return_itself_in_the_es_constructor(
-            es_mock, indices_mock
+        self.configure_search_engine_mock_to_return_itself_in_the_search_engine_constructor(
+            os_mock, indices_mock
         )
 
         with self.assertRaisesRegex(Exception, "Index does not exist") as cm:
-            create_elasticsearch_data_mapper("localhost", "zpto")
+            create_opensearch_data_mapper("localhost", "zpto")
 
 
-class ElasticSearchBaseTestCase(TestCase):
+class OpenSearchBaseTestCase(TestCase):
 
     INDEX = "gazettes"
     _data = []
@@ -113,11 +112,11 @@ def build_expected_query(
         return query
 
     def setUp(self):
-        self.es_mock = self.create_patch("elasticsearch.Elasticsearch")
-        self.indices_mock = self.create_patch("elasticsearch.client.IndicesClient")
+        self.os_mock = self.create_patch("opensearchpy.OpenSearch")
+        self.indices_mock = self.create_patch("opensearchpy.client.IndicesClient")
         self.generate_data()
-        self._mapper = ElasticSearchDataMapper("localhost", self.INDEX)
-        self.configure_es_mock_to_return_itself_in_the_es_constructor()
+        self._mapper = OpenSearchDataMapper("localhost", self.INDEX)
+        self.configure_search_engine_mock_to_return_itself_in_the_search_engine_constructor()
         self.set_mock_search_return()
 
     def create_patch(self, name):
@@ -125,9 +124,9 @@ def create_patch(self, name):
         self.addCleanup(patcher.stop)
         return patcher.start()
 
-    def configure_es_mock_to_return_itself_in_the_es_constructor(self):
-        self.es_mock.indices = self.indices_mock
-        self.es_mock = self.es_mock.return_value
+    def configure_search_engine_mock_to_return_itself_in_the_search_engine_constructor(self):
+        self.os_mock.indices = self.indices_mock
+        self.os_mock = self.os_mock.return_value
 
     def set_mock_search_return(self):
         hits = [
@@ -141,7 +140,7 @@ def set_mock_search_return(self):
             }
             for hit in self._data
         ]
-        self.es_mock.search.return_value = {
+        self.os_mock.search.return_value = {
             "took": 4,
             "timed_out": False,
             "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0},
@@ -176,7 +175,7 @@ def get_expected_document_page(self, page_number, page_size):
         return expected_gazettes[start_slice:end_slice]
 
 
-class ElasticSearchDataMapperTest(ElasticSearchBaseTestCase):
+class OpenSearchDataMapperTest(OpenSearchBaseTestCase):
 
     TERRITORY_ID1 = "3304557"
     TERRITORY_ID2 = "4205902"
@@ -435,7 +434,7 @@ def assert_query_body_is_correct(
             offset=offset,
             size=size,
         )
-        self.es_mock.search.assert_called_with(body=expected_query, index=self.INDEX)
+        self.os_mock.search.assert_called_with(body=expected_query, index=self.INDEX)
 
     def test_get_none_gazettes(self):
         self._data = []
@@ -485,7 +484,7 @@ def test_return_get_gazettes_sort_by_date_in_descending_order(self):
         self.assertGreater(len(gazettes), 0)
         self.assertGreater(gazettes[0].date, gazettes[-1].date)
 
-    def set_empty_es_return(self):
+    def set_empty_search_engine_return(self):
         self._data = []
         self.set_mock_search_return()
 
@@ -564,13 +563,13 @@ def test_get_gazettes_by_querystring(self):
         self.assertCountEqual(gazettes, expected_gazettes)
 
     def test_get_gazettes_by_invalid_since_date(self):
-        self.set_empty_es_return()
+        self.set_empty_search_engine_return()
        two_months_future = date.today() + timedelta(weeks=8)
         gazettes = self._mapper.get_gazettes(since=two_months_future)[1]
         self.assertEqual(0, len(gazettes), msg="No gazettes should be return ")
 
     def test_get_gazettes_by_invalid_until_date(self):
-        self.set_empty_es_return()
+        self.set_empty_search_engine_return()
        two_months_ago = date.today() - timedelta(weeks=8)
         gazettes = self._mapper.get_gazettes(until=two_months_ago)[1]
         self.assertEqual(0, len(gazettes), msg="No gazettes should be return ")
@@ -585,16 +584,16 @@ def is_running_integration_tests():
     return os.environ.get("RUN_INTEGRATION_TESTS", 0) == "1"
 
 
-class ElasticSearchIntegrationBaseTestCase(TestCase):
+class OpenSearchIntegrationBaseTestCase(TestCase):
 
     _data = []
 
     def delete_index(self):
         for attempt in range(3):
             try:
-                self._es.indices.delete(
-                    index=self.INDEX, ignore_unavailable=True, timeout="30s"
+                self.search_engine.indices.delete(
+                    index=self.INDEX, ignore_unavailable=True, timeout=30
                 )
-                self._es.indices.refresh()
+                self.search_engine.indices.refresh()
                 return
             except Exception as e:
                 time.sleep(10)
@@ -602,12 +601,12 @@ def delete_index(self):
     def create_index(self):
         for attempt in range(3):
             try:
-                self._es.indices.create(
+                self.search_engine.indices.create(
                     index=self.INDEX,
                     body={"mappings": {"properties": {"date": {"type": "date"}}}},
-                    timeout="30s",
+                    timeout=30,
                 )
-                self._es.indices.refresh()
+                self.search_engine.indices.refresh()
                 return
             except Exception as e:
                 time.sleep(10)
@@ -619,7 +618,7 @@ def recreate_index(self):
     def try_push_data_to_index(self, bulk_data):
         for attempt in range(3):
             try:
-                self._es.bulk(bulk_data, index=self.INDEX, refresh=True, timeout="30s")
+                self.search_engine.bulk(bulk_data, index=self.INDEX, refresh=True, timeout=30)
                 return
             except Exception as e:
                 time.sleep(10)
@@ -634,14 +633,14 @@ def add_data_on_index(self):
         self.try_push_data_to_index(bulk_data)
 
     def setUp(self):
-        self._es = elasticsearch.Elasticsearch(hosts=["localhost"])
+        self.search_engine = opensearchpy.OpenSearch(hosts=["localhost"])
         self.recreate_index()
         self.generate_data()
         self.add_data_on_index()
-        self._mapper = create_elasticsearch_data_mapper("localhost", self.INDEX)
+        self._mapper = 
create_opensearch_data_mapper("localhost", self.INDEX) def tearDown(self): - self._es.close() + self.search_engine.close() def get_latest_gazettes_files(self, gazettes_count): self._data.sort(reverse=True, key=lambda x: x["date"]) @@ -667,7 +666,7 @@ def get_expected_document_page(self, offset, size): @skipUnless(is_running_integration_tests(), "Integration tests disable") -class ElasticSearchDataMapperPaginationTest(ElasticSearchIntegrationBaseTestCase): +class OpenSearchDataMapperPaginationTest(OpenSearchIntegrationBaseTestCase): INDEX = "gazettes_pagination" TERRITORY_ID = "3304557" @@ -718,7 +717,7 @@ def assert_basic_function_calls( offset=offset, size=size, ) - self.es_mock.search.assert_called_with(body=expected_query, index=self.INDEX) + self.os_mock.search.assert_called_with(body=expected_query, index=self.INDEX) def test_page_size(self): gazettes = self._mapper.get_gazettes( @@ -829,7 +828,7 @@ def test_get_all_pages_available(self): @skipUnless(is_running_integration_tests(), "Integration tests disable") -class ElasticSearchDataMapperQuerystringTest(ElasticSearchIntegrationBaseTestCase): +class OpenSearchDataMapperQuerystringTest(OpenSearchIntegrationBaseTestCase): INDEX = "gazettes_querystring" @@ -1059,7 +1058,7 @@ def test_get_gazettes_by_precedence_operator(self): ) -class Elasticsearch(TestCase): +class Opensearch(TestCase): def setUp(self): self.host = "localhost" self.index = "gazettes" @@ -1267,22 +1266,21 @@ def setUp(self): }, } - def test_elasticsearch_data_mapper_creation(self): - with patch("elasticsearch.Elasticsearch") as es_mock: - es = ElasticSearchDataMapper(self.host, self.index) - es._es.indices.exists.assert_called_once() + def test_opensearch_data_mapper_creation(self): + with patch("opensearchpy.OpenSearch") as os_mock: + data_mapper = OpenSearchDataMapper(self.host, self.index) + data_mapper.search_engine.indices.exists.assert_called_once() - @patch("elasticsearch.Elasticsearch") - def test_get_total_number_items(self, es_mock): - 
es = ElasticSearchDataMapper(self.host, self.index)
-        total_items = es.get_total_number_items(self.search_result_json)
+    @patch("opensearchpy.OpenSearch")
+    def test_get_total_number_items(self, os_mock):
+        data_mapper = OpenSearchDataMapper(self.host, self.index)
+        total_items = data_mapper.get_total_number_items(self.search_result_json)
         self.assertEqual(total_items, 8)
 
-    @patch("elasticsearch.Elasticsearch")
-    def test_total_number_of_items_found_return(self, es_mock):
-        es_mock.search.return_value = self.search_result_json
-        es = ElasticSearchDataMapper(self.host, self.index)
-        es._es = es_mock
-
-        total_items, _ = es.get_gazettes("4205920", None, None, None, 1, 4)
+    @patch("opensearchpy.OpenSearch")
+    def test_total_number_of_items_found_return(self, os_mock):
+        os_mock.search.return_value = self.search_result_json
+        data_mapper = OpenSearchDataMapper(self.host, self.index)
+        data_mapper.search_engine = os_mock
+        total_items, _ = data_mapper.get_gazettes("4205920", None, None, None, 1, 4)
         self.assertEqual(total_items, 8)
diff --git a/themed_excerpts/themed_excerpt_access.py b/themed_excerpts/themed_excerpt_access.py
index 1383727..ad9f3c8 100644
--- a/themed_excerpts/themed_excerpt_access.py
+++ b/themed_excerpts/themed_excerpt_access.py
@@ -5,7 +5,7 @@
 from typing import Dict, List, Tuple, Union
 
 from index import SearchEngineInterface
-from index.elasticsearch import (
+from index.opensearch import (
     QueryBuilderInterface,
     DateRangeQueryMixin,
     SimpleStringQueryMixin,