Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Aug 16, 2024
2 parents 3f110ad + d3bcf31 commit eb4110f
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 22 deletions.
3 changes: 1 addition & 2 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,15 @@ AZURE_OPENAI_API_KEY=xxx
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=gpt-4o-2024-05-13
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=text-embedding-3-large-1
AZURE_OPENAI_EMBEDDING_DIMENSIONS=3072
AZURE_OPENAI_EMBEDDING_MODEL_NAME=text-embedding-3-large
AZURE_OPENAI_ENDPOINT=https://xxx.openai.azure.com

# AI Search
AZURE_SEARCH_API_KEY=xxx
AZURE_SEARCH_ENDPOINT=https://xxx.search.windows.net
AZURE_SEARCH_INDEX_NAME=learn

# Blob Storage
AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=xxx;AccountKey=xxx;EndpointSuffix=core.windows.net
AZURE_STORAGE_CONTAINER=learn-scraping

# Application Insights
APPLICATIONINSIGHTS_CONNECTION_STRING=xxx
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Scraper:

Indexer:

- [x] AI Search index is created automatically
- [x] Chunk markdown while keeping the content coherent
- [x] Embed chunks with OpenAI embeddings
- [x] Indexed content is semantically searchable with [Azure AI Search](https://learn.microsoft.com/en-us/azure/search)
Expand Down Expand Up @@ -103,10 +104,10 @@ Basic usage:
export AZURE_OPENAI_API_KEY=xxx
export AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=xxx
export AZURE_OPENAI_EMBEDDING_DIMENSIONS=xxx
export AZURE_OPENAI_EMBEDDING_MODEL_NAME=xxx
export AZURE_OPENAI_ENDPOINT=xxx
export AZURE_SEARCH_API_KEY=xxx
export AZURE_SEARCH_ENDPOINT=xxx
export AZURE_SEARCH_INDEX_NAME=xxx
export AZURE_STORAGE_CONNECTION_STRING=xxx
scrape-it-now index run [job_name]
```
Expand All @@ -131,10 +132,10 @@ Most frequent options are:
| `--azure-openai-api-key`</br>`-aoak` | Azure OpenAI API key | `AZURE_OPENAI_API_KEY` |
| `--azure-openai-embedding-deployment-name`</br>`-aoedn` | Azure OpenAI embedding deployment name | `AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME` |
| `--azure-openai-embedding-dimensions`</br>`-aoed` | Azure OpenAI embedding dimensions | `AZURE_OPENAI_EMBEDDING_DIMENSIONS` |
| `--azure-openai-embedding-model-name`</br>`-aoemn` | Azure OpenAI embedding model name | `AZURE_OPENAI_EMBEDDING_MODEL_NAME` |
| `--azure-openai-endpoint`</br>`-aoe` | Azure OpenAI endpoint | `AZURE_OPENAI_ENDPOINT` |
| `--azure-search-api-key`</br>`-asak` | Azure Search API key | `AZURE_SEARCH_API_KEY` |
| `--azure-search-endpoint`</br>`-ase` | Azure Search endpoint | `AZURE_SEARCH_ENDPOINT` |
| `--azure-search-index-name`</br>`-asin` | Azure Search index name | `AZURE_SEARCH_INDEX_NAME` |
| `--azure-storage-connection-string`</br>`-ascs` | Azure Storage connection string | `AZURE_STORAGE_CONNECTION_STRING` |

For documentation on all available options, run:
Expand Down
18 changes: 9 additions & 9 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,13 @@ def index() -> None:
required=True,
type=str,
)
@click.option(
"--azure-openai-embedding-model-name",
"-aoemn",
envvar="AZURE_OPENAI_EMBEDDING_MODEL_NAME",
required=True,
type=str,
)
@click.option(
"--processes",
"-p",
Expand All @@ -290,13 +297,6 @@ def index() -> None:
required=True,
type=str,
)
@click.option(
"--azure-search-index-name",
"-asin",
envvar="AZURE_SEARCH_INDEX_NAME",
required=True,
type=str,
)
@click.option(
"--azure-openai-api-key",
"-aoak",
Expand Down Expand Up @@ -329,10 +329,10 @@ async def index_run(
azure_openai_api_key: str,
azure_openai_embedding_deployment_name: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model_name: str,
azure_openai_endpoint: str,
azure_search_api_key: str,
azure_search_endpoint: str,
azure_search_index_name: str,
azure_storage_connection_string: str,
job_name: str | None,
openai_api_version: str,
Expand All @@ -349,13 +349,13 @@ async def index_run(
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment_name,
azure_openai_embedding_dimensions=azure_openai_embedding_dimensions,
azure_openai_embedding_model=azure_openai_embedding_model_name,
azure_openai_endpoint=azure_openai_endpoint,
job=job_name,
openai_api_version=openai_api_version,
processes=processes,
search_api_key=azure_search_api_key,
search_endpoint=azure_search_endpoint,
search_index=azure_search_index_name,
storage_connection_string=azure_storage_connection_string,
)

Expand Down
93 changes: 93 additions & 0 deletions app/helpers/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,23 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceExistsError
from azure.search.documents.aio import SearchClient
from azure.search.documents.indexes.aio import SearchIndexClient
from azure.storage.blob.aio import BlobServiceClient, ContainerClient
from azure.storage.queue.aio import QueueClient, QueueServiceClient
from openai import AsyncAzureOpenAI
from azure.search.documents.indexes.models import (
AzureOpenAIParameters,
AzureOpenAIVectorizer,
HnswAlgorithmConfiguration,
LexicalAnalyzerName,
SearchableField,
SearchField,
SearchFieldDataType,
SearchIndex,
SimpleField,
VectorSearch,
VectorSearchProfile,
)

from app.helpers.logging import logger

Expand All @@ -33,12 +47,91 @@ async def openai_client(
@asynccontextmanager
async def search_client(
api_key: str,
azure_openai_api_key: str,
azure_openai_embedding_deployment: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model: str,
azure_openai_endpoint: str,
endpoint: str,
index: str,
) -> AsyncGenerator[SearchClient, None]:
"""
Get the Azure AI Search client.
"""
# Index configuration
fields = [
SimpleField(
name="id",
key=True,
type=SearchFieldDataType.String,
),
SearchableField(
name="content",
analyzer_name=LexicalAnalyzerName.STANDARD_LUCENE,
type=SearchFieldDataType.String,
),
SimpleField(
name="url",
analyzer_name=LexicalAnalyzerName.STANDARD_LUCENE,
filterable=True,
sortable=True,
type=SearchFieldDataType.String,
),
SearchField(
name="vectors",
searchable=True,
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
vector_search_dimensions=azure_openai_embedding_dimensions,
vector_search_profile_name="default",
),
]
vector_search = VectorSearch(
profiles=[
VectorSearchProfile(
algorithm_configuration_name="default",
name="default",
),
],
algorithms=[
HnswAlgorithmConfiguration(
name="default",
),
],
vectorizers=[
AzureOpenAIVectorizer(
name="default",
azure_open_ai_parameters=AzureOpenAIParameters(
api_key=azure_openai_api_key,
deployment_id=azure_openai_embedding_deployment,
model_name=azure_openai_embedding_model,
resource_uri=azure_openai_endpoint,
)
)
]
)

# Create index if it does not exist
async with SearchIndexClient(
# Deployment
endpoint=endpoint,
index_name=index,
# Index configuration
fields=fields,
vector_search=vector_search,
# Authentication
credential=AzureKeyCredential(api_key),
) as client:
try:
await client.create_index(SearchIndex(
fields=fields,
name=index,
vector_search=vector_search,
))
logger.info('Created Search "%s"', index)
except ResourceExistsError:
pass

# Return client
async with SearchClient(
# Deployment
endpoint=endpoint,
Expand Down
9 changes: 8 additions & 1 deletion app/helpers/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,20 @@ def scrape_queue_name(job_name: str) -> str:
return f"{job_name}-to-scrape"


def chunck_queue_name(job_name: str) -> str:
def index_queue_name(job_name: str) -> str:
    """
    Build the name of the queue that feeds the indexer for a job.

    The name is the job name followed by the "-to-chunk" suffix.
    """
    suffix_template = "{}-to-chunk"
    return suffix_template.format(job_name)


def index_index_name(job_name: str) -> str:
    """
    Return the name of the search index used by a job.

    The index name is the job name itself, unchanged.
    """
    index_name = job_name
    return index_name


def hash_url(url: str) -> str:
"""
Hash a URL to a unique identifier.
Expand Down
17 changes: 11 additions & 6 deletions app/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
queue_client,
search_client,
)
from app.helpers.resources import chunck_queue_name, hash_url, scrape_container_name
from app.helpers.resources import index_queue_name, hash_url, scrape_container_name, index_index_name
from app.helpers.threading import run_workers
from app.models.indexed import IndexedIngestModel
from app.models.scraped import ScrapedUrlModel
Expand Down Expand Up @@ -334,13 +334,13 @@ async def run(
azure_openai_api_key: str,
azure_openai_embedding_deployment: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model: str,
azure_openai_endpoint: str,
job: str,
openai_api_version: str,
processes: int,
search_api_key: str,
search_endpoint: str,
search_index: str,
storage_connection_string: str,
) -> None:
logger.info("Starting indexing job %s", job)
Expand All @@ -349,6 +349,7 @@ async def run(
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment,
azure_openai_embedding_dimensions=azure_openai_embedding_dimensions,
azure_openai_embedding_model=azure_openai_embedding_model,
azure_openai_endpoint=azure_openai_endpoint,
count=processes,
func=_worker,
Expand All @@ -357,7 +358,6 @@ async def run(
openai_api_version=openai_api_version,
search_api_key=search_api_key,
search_endpoint=search_endpoint,
search_index=search_index,
storage_connection_string=storage_connection_string,
)

Expand All @@ -366,12 +366,12 @@ async def _worker(
azure_openai_api_key: str,
azure_openai_embedding_deployment: str,
azure_openai_embedding_dimensions: int,
azure_openai_embedding_model: str,
azure_openai_endpoint: str,
job: str,
openai_api_version: str,
search_api_key: str,
search_endpoint: str,
search_index: str,
storage_connection_string: str,
) -> None:
# Init clients
Expand All @@ -386,12 +386,17 @@ async def _worker(
) as openai:
async with queue_client(
connection_string=storage_connection_string,
queue=chunck_queue_name(job),
queue=index_queue_name(job),
) as queue:
async with search_client(
api_key=search_api_key,
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment,
azure_openai_embedding_dimensions=azure_openai_embedding_dimensions,
azure_openai_embedding_model=azure_openai_embedding_model,
azure_openai_endpoint=azure_openai_endpoint,
endpoint=search_endpoint,
index=search_index,
index=index_index_name(job),
) as search:

# Process the queue
Expand Down
4 changes: 2 additions & 2 deletions app/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from app.helpers.logging import logger
from app.helpers.persistence import blob_client, queue_client
from app.helpers.resources import (
chunck_queue_name,
index_queue_name,
hash_url,
resources_dir,
scrape_container_name,
Expand Down Expand Up @@ -329,7 +329,7 @@ async def _worker(
) as in_queue:
async with queue_client(
connection_string=storage_connection_string,
queue=chunck_queue_name(job),
queue=index_queue_name(job),
) as out_queue:

# Init Playwright context
Expand Down

0 comments on commit eb4110f

Please sign in to comment.