feat: Create AI Search index automatically
clemlesne committed Aug 16, 2024
1 parent e428dbd commit d3bcf31
Showing 7 changed files with 127 additions and 22 deletions.
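The heart of the change is in `app/helpers/persistence.py`: the `search_client` helper now provisions the AI Search index itself before handing back a `SearchClient`, so a pre-created index (and the `AZURE_SEARCH_INDEX_NAME` setting) is no longer required. A minimal sketch of that create-if-missing pattern, assuming the same `azure-search-documents` async clients used in the diff and an illustrative single-field schema:

```python
import asyncio

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceExistsError
from azure.search.documents.indexes.aio import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
)


async def ensure_index(endpoint: str, api_key: str, index: str) -> None:
    # Illustrative schema: a single key field; the real index also defines
    # content, url and vector fields (see persistence.py below)
    fields = [
        SimpleField(name="id", key=True, type=SearchFieldDataType.String),
    ]
    async with SearchIndexClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    ) as client:
        try:
            await client.create_index(SearchIndex(name=index, fields=fields))
        except ResourceExistsError:
            pass  # Index already exists, nothing to do


if __name__ == "__main__":
    # Hypothetical values for illustration only
    asyncio.run(ensure_index("https://xxx.search.windows.net", "xxx", "learn"))
```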
3 changes: 1 addition & 2 deletions .env.example
@@ -3,16 +3,15 @@ AZURE_OPENAI_API_KEY=xxx
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-4o-2024-05-13
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=text-embedding-3-large-1
AZURE_OPENAI_EMBEDDING_DIMENSIONS=3072
AZURE_OPENAI_EMBEDDING_MODEL_NAME=text-embedding-3-large
AZURE_OPENAI_ENDPOINT=https://xxx.openai.azure.com

# AI Search
AZURE_SEARCH_API_KEY=xxx
AZURE_SEARCH_ENDPOINT=https://xxx.search.windows.net
AZURE_SEARCH_INDEX_NAME=learn

# Blob Storage
AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=xxx;AccountKey=xxx;EndpointSuffix=core.windows.net
AZURE_STORAGE_CONTAINER=learn-scraping

# Application Insights
APPLICATIONINSIGHTS_CONNECTION_STRING=xxx
5 changes: 3 additions & 2 deletions README.md
@@ -23,6 +23,7 @@ Scraper:

Indexer:

- [x] AI Search index is created automatically
- [x] Chunk markdown while keeping the content coherent
- [x] Embed chunks with OpenAI embeddings
- [x] Indexed content is semantically searchable with [Azure AI Search](https://learn.microsoft.com/en-us/azure/search)
@@ -103,10 +104,10 @@ Basic usage:
export AZURE_OPENAI_API_KEY=xxx
export AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=xxx
export AZURE_OPENAI_EMBEDDING_DIMENSIONS=xxx
export AZURE_OPENAI_EMBEDDING_MODEL_NAME=xxx
export AZURE_OPENAI_ENDPOINT=xxx
export AZURE_SEARCH_API_KEY=xxx
export AZURE_SEARCH_ENDPOINT=xxx
export AZURE_SEARCH_INDEX_NAME=xxx
export AZURE_STORAGE_CONNECTION_STRING=xxx
scrape-it-now index run [job_name]
```
@@ -131,10 +132,10 @@ Most frequent options are:
| `--azure-openai-api-key`</br>`-aoak` | Azure OpenAI API key | `AZURE_OPENAI_API_KEY` |
| `--azure-openai-embedding-deployment-name`</br>`-aoedn` | Azure OpenAI embedding deployment name | `AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME` |
| `--azure-openai-embedding-dimensions`</br>`-aoed` | Azure OpenAI embedding dimensions | `AZURE_OPENAI_EMBEDDING_DIMENSIONS` |
| `--azure-openai-embedding-model-name`</br>`-aoemn` | Azure OpenAI embedding model name | `AZURE_OPENAI_EMBEDDING_MODEL_NAME` |
| `--azure-openai-endpoint`</br>`-aoe` | Azure OpenAI endpoint | `AZURE_OPENAI_ENDPOINT` |
| `--azure-search-api-key`</br>`-asak` | Azure Search API key | `AZURE_SEARCH_API_KEY` |
| `--azure-search-endpoint`</br>`-ase` | Azure Search endpoint | `AZURE_SEARCH_ENDPOINT` |
| `--azure-search-index-name`</br>`-asin` | Azure Search index name | `AZURE_SEARCH_INDEX_NAME` |
| `--azure-storage-connection-string`</br>`-ascs` | Azure Storage connection string | `AZURE_STORAGE_CONNECTION_STRING` |
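
Put together, a hypothetical invocation that passes the same settings as flags instead of environment variables (all values are placeholders):

```bash
scrape-it-now index run my-job \
  --azure-openai-api-key xxx \
  --azure-openai-embedding-deployment-name text-embedding-3-large-1 \
  --azure-openai-embedding-dimensions 3072 \
  --azure-openai-embedding-model-name text-embedding-3-large \
  --azure-openai-endpoint https://xxx.openai.azure.com \
  --azure-search-api-key xxx \
  --azure-search-endpoint https://xxx.search.windows.net \
  --azure-storage-connection-string "DefaultEndpointsProtocol=https;AccountName=xxx;AccountKey=xxx;EndpointSuffix=core.windows.net"
```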

For documentation on all available options, run:
18 changes: 9 additions & 9 deletions app/app.py
@@ -268,6 +268,13 @@ def index() -> None:
required=True,
type=str,
)
@click.option(
"--azure-openai-embedding-model-name",
"-aoemn",
envvar="AZURE_OPENAI_EMBEDDING_MODEL_NAME",
required=True,
type=str,
)
@click.option(
"--processes",
"-p",
@@ -290,13 +297,6 @@ def index() -> None:
required=True,
type=str,
)
@click.option(
"--azure-search-index-name",
"-asin",
envvar="AZURE_SEARCH_INDEX_NAME",
required=True,
type=str,
)
@click.option(
"--azure-openai-api-key",
"-aoak",
@@ -329,10 +329,10 @@ async def index_run(
azure_openai_api_key: str,
azure_openai_embedding_deployment_name: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model_name: str,
azure_openai_endpoint: str,
azure_search_api_key: str,
azure_search_endpoint: str,
azure_search_index_name: str,
azure_storage_connection_string: str,
job_name: str | None,
openai_api_version: str,
@@ -349,13 +349,13 @@ async def index_run(
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment_name,
azure_openai_embedding_dimensions=azure_openai_embedding_dimensions,
azure_openai_embedding_model=azure_openai_embedding_model_name,
azure_openai_endpoint=azure_openai_endpoint,
job=job_name,
openai_api_version=openai_api_version,
processes=processes,
search_api_key=azure_search_api_key,
search_endpoint=azure_search_endpoint,
search_index=azure_search_index_name,
storage_connection_string=azure_storage_connection_string,
)

93 changes: 93 additions & 0 deletions app/helpers/persistence.py
@@ -4,9 +4,23 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceExistsError
from azure.search.documents.aio import SearchClient
from azure.search.documents.indexes.aio import SearchIndexClient
from azure.storage.blob.aio import BlobServiceClient, ContainerClient
from azure.storage.queue.aio import QueueClient, QueueServiceClient
from openai import AsyncAzureOpenAI
from azure.search.documents.indexes.models import (
AzureOpenAIParameters,
AzureOpenAIVectorizer,
HnswAlgorithmConfiguration,
LexicalAnalyzerName,
SearchableField,
SearchField,
SearchFieldDataType,
SearchIndex,
SimpleField,
VectorSearch,
VectorSearchProfile,
)

from app.helpers.logging import logger

@@ -33,12 +33,91 @@ async def openai_client(
@asynccontextmanager
async def search_client(
api_key: str,
azure_openai_api_key: str,
azure_openai_embedding_deployment: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model: str,
azure_openai_endpoint: str,
endpoint: str,
index: str,
) -> AsyncGenerator[SearchClient, None]:
"""
Get the Azure AI Search client.
"""
# Index configuration
fields = [
SimpleField(
name="id",
key=True,
type=SearchFieldDataType.String,
),
SearchableField(
name="content",
analyzer_name=LexicalAnalyzerName.STANDARD_LUCENE,
type=SearchFieldDataType.String,
),
SimpleField(
name="url",
analyzer_name=LexicalAnalyzerName.STANDARD_LUCENE,
filterable=True,
sortable=True,
type=SearchFieldDataType.String,
),
SearchField(
name="vectors",
searchable=True,
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
vector_search_dimensions=azure_openai_embedding_dimensions,
vector_search_profile_name="default",
),
]
vector_search = VectorSearch(
profiles=[
VectorSearchProfile(
algorithm_configuration_name="default",
name="default",
),
],
algorithms=[
HnswAlgorithmConfiguration(
name="default",
),
],
vectorizers=[
AzureOpenAIVectorizer(
name="default",
azure_open_ai_parameters=AzureOpenAIParameters(
api_key=azure_openai_api_key,
deployment_id=azure_openai_embedding_deployment,
model_name=azure_openai_embedding_model,
resource_uri=azure_openai_endpoint,
)
)
]
)

# Create index if it does not exist
async with SearchIndexClient(
# Deployment
endpoint=endpoint,
index_name=index,
# Index configuration
fields=fields,
vector_search=vector_search,
# Authentication
credential=AzureKeyCredential(api_key),
) as client:
try:
await client.create_index(SearchIndex(
fields=fields,
name=index,
vector_search=vector_search,
))
logger.info('Created Search "%s"', index)
except ResourceExistsError:
pass

# Return client
async with SearchClient(
# Deployment
endpoint=endpoint,
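Because the index defines a `vectors` field backed by an Azure OpenAI vectorizer, callers can run hybrid keyword-plus-vector queries against it once documents are ingested. A hypothetical query sketch, assuming `azure-search-documents` 11.4+ with `VectorizableTextQuery` and the field names defined above:

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.aio import SearchClient
from azure.search.documents.models import VectorizableTextQuery


async def query_index(endpoint: str, api_key: str, index: str, query: str) -> list[dict]:
    # Hybrid search: BM25 over "content" plus vector similarity over "vectors";
    # the query text is embedded server-side by the index's vectorizer
    async with SearchClient(
        endpoint=endpoint,
        index_name=index,
        credential=AzureKeyCredential(api_key),
    ) as client:
        results = await client.search(
            search_text=query,
            vector_queries=[
                VectorizableTextQuery(
                    text=query,
                    fields="vectors",
                    k_nearest_neighbors=5,
                ),
            ],
            select=["id", "url", "content"],
            top=5,
        )
        return [doc async for doc in results]
```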
9 changes: 8 additions & 1 deletion app/helpers/resources.py
@@ -34,13 +34,20 @@ def scrape_queue_name(job_name: str) -> str:
return f"{job_name}-to-scrape"


def chunck_queue_name(job_name: str) -> str:
def index_queue_name(job_name: str) -> str:
"""
Get the output queue name for the job.
"""
return f"{job_name}-to-chunk"


def index_index_name(job_name: str) -> str:
"""
Get the index name for the job.
"""
return job_name


def hash_url(url: str) -> str:
"""
Hash a URL to a unique identifier.
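With these helpers, every per-job resource name is derived from the job name alone, which is what lets the dedicated index-name setting go away. A small illustration, assuming the package is importable as `app` and a hypothetical job called `learn`:

```python
from app.helpers.resources import index_index_name, index_queue_name, scrape_queue_name

job = "learn"  # hypothetical job name
print(scrape_queue_name(job))  # -> "learn-to-scrape" (URLs waiting to be scraped)
print(index_queue_name(job))   # -> "learn-to-chunk" (pages waiting to be chunked and embedded)
print(index_index_name(job))   # -> "learn" (AI Search index for the job)
```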
17 changes: 11 additions & 6 deletions app/index.py
@@ -26,7 +26,7 @@
queue_client,
search_client,
)
from app.helpers.resources import chunck_queue_name, hash_url, scrape_container_name
from app.helpers.resources import index_queue_name, hash_url, scrape_container_name, index_index_name
from app.helpers.threading import run_workers
from app.models.indexed import IndexedIngestModel
from app.models.scraped import ScrapedUrlModel
@@ -334,13 +334,13 @@ async def run(
azure_openai_api_key: str,
azure_openai_embedding_deployment: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model: str,
azure_openai_endpoint: str,
job: str,
openai_api_version: str,
processes: int,
search_api_key: str,
search_endpoint: str,
search_index: str,
storage_connection_string: str,
) -> None:
logger.info("Starting indexing job %s", job)
@@ -349,6 +349,7 @@ async def run(
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment,
azure_openai_embedding_dimensions=azure_openai_embedding_dimensions,
azure_openai_embedding_model=azure_openai_embedding_model,
azure_openai_endpoint=azure_openai_endpoint,
count=processes,
func=_worker,
@@ -357,7 +358,6 @@ async def run(
openai_api_version=openai_api_version,
search_api_key=search_api_key,
search_endpoint=search_endpoint,
search_index=search_index,
storage_connection_string=storage_connection_string,
)

@@ -366,12 +366,12 @@ async def _worker(
azure_openai_api_key: str,
azure_openai_embedding_deployment: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model: str,
azure_openai_endpoint: str,
job: str,
openai_api_version: str,
search_api_key: str,
search_endpoint: str,
search_index: str,
storage_connection_string: str,
) -> None:
# Init clients
@@ -386,12 +386,17 @@ async def _worker(
) as openai:
async with queue_client(
connection_string=storage_connection_string,
queue=chunck_queue_name(job),
queue=index_queue_name(job),
) as queue:
async with search_client(
api_key=search_api_key,
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment,
azure_openai_embedding_dimensions=azure_openai_embedding_dimensions,
azure_openai_embedding_model=azure_openai_embedding_model,
azure_openai_endpoint=azure_openai_endpoint,
endpoint=search_endpoint,
index=search_index,
index=index_index_name(job),
) as search:

# Process the queue
4 changes: 2 additions & 2 deletions app/scrape.py
@@ -21,7 +21,7 @@
from app.helpers.logging import logger
from app.helpers.persistence import blob_client, queue_client
from app.helpers.resources import (
chunck_queue_name,
index_queue_name,
hash_url,
resources_dir,
scrape_container_name,
@@ -329,7 +329,7 @@ async def _worker(
) as in_queue:
async with queue_client(
connection_string=storage_connection_string,
queue=chunck_queue_name(job),
queue=index_queue_name(job),
) as out_queue:

# Init Playwright context
