From 9b4f24ebce075570f9f8691044e1775db9973020 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Fri, 15 Nov 2024 14:33:26 -0800 Subject: [PATCH] First cut at config cleanup (#1411) * First cut at config cleanup * Reorder top nav * Add query prompts to tuning page * Remove dynamic notebook from nav * Add more thorough yml config descriptions in docs * Further clean out the config * Semver * Add new blog post * Emphasize yaml * Clarify output * Fix unit test * Fix bullet nesting --- .../patch-20241115203715659970.json | 4 + docs/blog_posts.md | 7 + docs/config/custom.md | 162 ------------- docs/config/overview.md | 6 +- docs/config/template.md | 167 ------------- docs/config/{json_yaml.md => yaml.md} | 221 ++++++++++++------ docs/get_started.md | 2 +- docs/prompt_tuning/manual_prompt_tuning.md | 70 ++++-- docs/prompt_tuning/overview.md | 9 +- docs/query/drift_search.md | 2 +- graphrag/cli/initialize.py | 2 +- graphrag/config/defaults.py | 2 +- graphrag/config/init_content.py | 141 +++++++++++ graphrag/config/models/__init__.py | 2 +- ...drift_config.py => drift_search_config.py} | 0 graphrag/config/models/graph_rag_config.py | 2 +- graphrag/index/init_content.py | 210 ----------------- .../drift_search/drift_context.py | 2 +- .../structured_search/drift_search/primer.py | 2 +- .../structured_search/drift_search/search.py | 2 +- mkdocs.yaml | 16 +- tests/unit/indexing/test_init_content.py | 2 +- v1-breaking-changes.md | 2 +- 23 files changed, 377 insertions(+), 658 deletions(-) create mode 100644 .semversioner/next-release/patch-20241115203715659970.json delete mode 100644 docs/config/custom.md delete mode 100644 docs/config/template.md rename docs/config/{json_yaml.md => yaml.md} (55%) create mode 100644 graphrag/config/init_content.py rename graphrag/config/models/{drift_config.py => drift_search_config.py} (100%) delete mode 100644 graphrag/index/init_content.py diff --git a/.semversioner/next-release/patch-20241115203715659970.json b/.semversioner/next-release/patch-20241115203715659970.json new file mode 100644 index 0000000000..dcbb6d12bf --- /dev/null +++ b/.semversioner/next-release/patch-20241115203715659970.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Config and docs updates." +} diff --git a/docs/blog_posts.md b/docs/blog_posts.md index 750c0afebb..b2467a1371 100644 --- a/docs/blog_posts.md +++ b/docs/blog_posts.md @@ -31,4 +31,11 @@ By Julian Whiting, Senior Machine Learning Engineer; Zachary Hills, Senior Software Engineer; [Alonso Guevara Fernández](https://www.microsoft.com/en-us/research/people/alonsog/), Sr. Software Engineer; [Ha Trinh](https://www.microsoft.com/en-us/research/people/trinhha/), Senior Data Scientist; Adam Bradley, Managing Partner, Strategic Research; [Jonathan Larson](https://www.microsoft.com/en-us/research/people/jolarso/), Senior Principal Data Architect +- [:octicons-arrow-right-24: __GraphRAG: Improving global search via dynamic community selection__](https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/) + + --- +
Published November 15, 2024 + + By Bryan Li, Research Intern; [Ha Trinh](https://www.microsoft.com/en-us/research/people/trinhha/), Senior Data Scientist; [Darren Edge](https://www.microsoft.com/en-us/research/people/daedge/), Senior Director; [Jonathan Larson](https://www.microsoft.com/en-us/research/people/jolarso/), Senior Principal Data Architect
+ \ No newline at end of file diff --git a/docs/config/custom.md b/docs/config/custom.md deleted file mode 100644 index 368dcbad93..0000000000 --- a/docs/config/custom.md +++ /dev/null @@ -1,162 +0,0 @@ -# Fully Custom Config - -The primary configuration sections for Indexing Engine pipelines are described below. Each configuration section can be expressed in Python (for use in Python API mode) as well as YAML, but YAML is show here for brevity. - -Using custom configuration is an advanced use-case. Most users will want to use the [Default Configuration](overview.md) instead. - -## Indexing Engine Examples - -The [examples](https://github.com/microsoft/graphrag/blob/main/examples/) directory contains several examples of how to use the indexing engine with _custom configuration_. - -Most examples include two different forms of running the pipeline, both are contained in the examples `run.py` - -1. Using mostly the Python API -2. Using mostly the a pipeline configuration file - -To run an example: - -- Run `poetry shell` to activate a virtual environment with the required dependencies. -- Run `PYTHONPATH="$(pwd)" python examples/path_to_example/run.py` from the `root` directory. - -For example to run the single_verb example, you would run the following commands: - -```bash -poetry shell -``` - -```sh -PYTHONPATH="$(pwd)" python examples/single_verb/run.py -``` - -# Configuration Sections - -# > extends - -This configuration allows you to extend a base configuration file or files. - -```yaml -# single base -extends: ../base_config.yml -``` - -```yaml -# multiple bases -extends: - - ../base_config.yml - - ../base_config2.yml -``` - -# > root_dir - -This configuration allows you to set the root directory for the pipeline. All data inputs and outputs are assumed to be relative to this path. - -```yaml -root_dir: /workspace/data_project -``` - -# > storage - -This configuration allows you define the output strategy for the pipeline. - -- `type`: The type of storage to use. Options are `file`, `memory`, and `blob` -- `base_dir` (`type: file` only): The base directory to store the data in. This is relative to the config root. -- `connection_string` (`type: blob` only): The connection string to use for blob storage. -- `container_name` (`type: blob` only): The container to use for blob storage. - -# > cache - -This configuration allows you define the cache strategy for the pipeline. - -- `type`: The type of cache to use. Options are `file` and `memory`, and `blob`. -- `base_dir` (`type: file` only): The base directory to store the cache in. This is relative to the config root. -- `connection_string` (`type: blob` only): The connection string to use for blob storage. -- `container_name` (`type: blob` only): The container to use for blob storage. - -# > reporting - -This configuration allows you define the reporting strategy for the pipeline. Report files are generated artifacts that summarize the performance metrics of the pipeline and emit any error messages. - -- `type`: The type of reporting to use. Options are `file`, `memory`, and `blob` -- `base_dir` (`type: file` only): The base directory to store the reports in. This is relative to the config root. -- `connection_string` (`type: blob` only): The connection string to use for blob storage. -- `container_name` (`type: blob` only): The container to use for blob storage. - -# > workflows - -This configuration section defines the workflow DAG for the pipeline. 
Here we define an array of workflows and express their inter-dependencies in steps: - -- `name`: The name of the workflow. This is used to reference the workflow in other parts of the config. -- `steps`: The DataShaper steps that this workflow comprises. If a step defines an input in the form of `workflow:`, then it is assumed to have a dependency on the output of that workflow. - -```yaml -workflows: - - name: workflow1 - steps: - - verb: derive - args: - column1: "col1" - column2: "col2" - - name: workflow2 - steps: - - verb: derive - args: - column1: "col1" - column2: "col2" - input: - # dependency established here - source: workflow:workflow1 -``` - -# > input - -- `type`: The type of input to use. Options are `file` or `blob`. -- `file_type`: The file type field discriminates between the different input types. Options are `csv` and `text`. -- `base_dir`: The base directory to read the input files from. This is relative to the config file. -- `file_pattern`: A regex to match the input files. The regex must have named groups for each of the fields in the file_filter. -- `post_process`: A DataShaper workflow definition to apply to the input before executing the primary workflow. -- `source_column` (`type: csv` only): The column containing the source/author of the data -- `text_column` (`type: csv` only): The column containing the text of the data -- `timestamp_column` (`type: csv` only): The column containing the timestamp of the data -- `timestamp_format` (`type: csv` only): The format of the timestamp - -```yaml -input: - type: file - file_type: csv - base_dir: ../data/csv # the directory containing the CSV files, this is relative to the config file - file_pattern: '.*[\/](?P[^\/]+)[\/](?P\d{4})-(?P\d{2})-(?P\d{2})_(?P[^_]+)_\d+\.csv$' # a regex to match the CSV files - # An additional file filter which uses the named groups from the file_pattern to further filter the files - # file_filter: - # # source: (source_filter) - # year: (2023) - # month: (06) - # # day: (22) - source_column: "author" # the column containing the source/author of the data - text_column: "message" # the column containing the text of the data - timestamp_column: "date(yyyyMMddHHmmss)" # optional, the column containing the timestamp of the data - timestamp_format: "%Y%m%d%H%M%S" # optional, the format of the timestamp - post_process: # Optional, set of steps to process the data before going into the workflow - - verb: filter - args: - column: "title", - value: "My document" -``` - -```yaml -input: - type: file - file_type: csv - base_dir: ../data/csv # the directory containing the CSV files, this is relative to the config file - file_pattern: '.*[\/](?P[^\/]+)[\/](?P\d{4})-(?P\d{2})-(?P\d{2})_(?P[^_]+)_\d+\.csv$' # a regex to match the CSV files - # An additional file filter which uses the named groups from the file_pattern to further filter the files - # file_filter: - # # source: (source_filter) - # year: (2023) - # month: (06) - # # day: (22) - post_process: # Optional, set of steps to process the data before going into the workflow - - verb: filter - args: - column: "title", - value: "My document" -``` diff --git a/docs/config/overview.md b/docs/config/overview.md index be03e7ef85..7b8a08bf84 100644 --- a/docs/config/overview.md +++ b/docs/config/overview.md @@ -7,9 +7,5 @@ The GraphRAG system is highly configurable. This page provides an overview of th The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. 
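For illustration, a near-minimal `settings.yml` in this mode needs little more than the LLM essentials (a sketch — the model value is a placeholder, and the env var is resolved from the generated `.env` file):

```yaml
llm:
  api_key: ${GRAPHRAG_API_KEY} # substituted from the .env file at load time
  type: openai_chat # or azure_openai_chat
  model: gpt-4-turbo-preview
```
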
The primary configuration sections for the Indexing Engine pipelines are described below. The main ways to set up GraphRAG in Default Configuration mode are via: - [Init command](init.md) (recommended) +- [Using YAML for deeper control](yaml.md) - [Purely using environment variables](env_vars.md) -- [Using JSON or YAML for deeper control](json_yaml.md) - -## Custom Configuration Mode - -Custom configuration mode is an advanced use-case. Most users will want to use the Default Configuration instead. The primary configuration sections for Indexing Engine pipelines are described below. Details about how to use custom configuration are available in the [Custom Configuration Mode](custom.md) documentation. diff --git a/docs/config/template.md b/docs/config/template.md deleted file mode 100644 index cc97b0d616..0000000000 --- a/docs/config/template.md +++ /dev/null @@ -1,167 +0,0 @@ -# Configuration Template - -The following template can be used and stored as a `.env` in the the directory where you're are pointing -the `--root` parameter on your Indexing Pipeline execution. - -For details about how to run the Indexing Pipeline, refer to the [Index CLI](../cli.md) documentation. - -## .env File Template - -Required variables are uncommented. All the optional configuration can be turned on or off as needed. - -### Minimal Configuration - -```bash -# Base LLM Settings -GRAPHRAG_API_KEY="your_api_key" -GRAPHRAG_API_BASE="http://.openai.azure.com" # For Azure OpenAI Users -GRAPHRAG_API_VERSION="api_version" # For Azure OpenAI Users - -# Text Generation Settings -GRAPHRAG_LLM_TYPE="azure_openai_chat" # or openai_chat -GRAPHRAG_LLM_DEPLOYMENT_NAME="gpt-4-turbo-preview" -GRAPHRAG_LLM_MODEL_SUPPORTS_JSON=True - -# Text Embedding Settings -GRAPHRAG_EMBEDDING_TYPE="azure_openai_embedding" # or openai_embedding -GRAPHRAG_LLM_DEPLOYMENT_NAME="text-embedding-3-small" - -# Data Mapping Settings -GRAPHRAG_INPUT_TYPE="text" - -``` - -### Full Configuration - -```bash - -# Required LLM Config - -# Input Data Configuration -GRAPHRAG_INPUT_TYPE="file" - -# Plaintext Input Data Configuration -# GRAPHRAG_INPUT_FILE_PATTERN=.*\.txt - -# Text Input Data Configuration -GRAPHRAG_INPUT_FILE_TYPE="text" -GRAPHRAG_INPUT_FILE_PATTERN=".*\.txt$" -GRAPHRAG_INPUT_SOURCE_COLUMN=source -# GRAPHRAG_INPUT_TIMESTAMP_COLUMN=None -# GRAPHRAG_INPUT_TIMESTAMP_FORMAT=None -# GRAPHRAG_INPUT_TEXT_COLUMN="text" -# GRAPHRAG_INPUT_ATTRIBUTE_COLUMNS=id -# GRAPHRAG_INPUT_TITLE_COLUMN="title" -# GRAPHRAG_INPUT_TYPE="file" -# GRAPHRAG_INPUT_CONNECTION_STRING=None -# GRAPHRAG_INPUT_CONTAINER_NAME=None -# GRAPHRAG_INPUT_BASE_DIR=None - -# Base LLM Settings -GRAPHRAG_API_KEY="your_api_key" -GRAPHRAG_API_BASE="http://.openai.azure.com" # For Azure OpenAI Users -GRAPHRAG_API_VERSION="api_version" # For Azure OpenAI Users -# GRAPHRAG_API_ORGANIZATION=None -# GRAPHRAG_API_PROXY=None - -# Text Generation Settings -# GRAPHRAG_LLM_TYPE=openai_chat -GRAPHRAG_LLM_API_KEY="your_api_key" # If GRAPHRAG_API_KEY is not set -GRAPHRAG_LLM_API_BASE="http://.openai.azure.com" # For Azure OpenAI Users and if GRAPHRAG_API_BASE is not set -GRAPHRAG_LLM_API_VERSION="api_version" # For Azure OpenAI Users and if GRAPHRAG_API_VERSION is not set -GRAPHRAG_LLM_MODEL_SUPPORTS_JSON=True # Suggested by default -# GRAPHRAG_LLM_API_ORGANIZATION=None -# GRAPHRAG_LLM_API_PROXY=None -# GRAPHRAG_LLM_DEPLOYMENT_NAME=None -# GRAPHRAG_LLM_MODEL=gpt-4-turbo-preview -# GRAPHRAG_LLM_MAX_TOKENS=4000 -# GRAPHRAG_LLM_REQUEST_TIMEOUT=180 -# GRAPHRAG_LLM_THREAD_COUNT=50 -# 
GRAPHRAG_LLM_THREAD_STAGGER=0.3 -# GRAPHRAG_LLM_CONCURRENT_REQUESTS=25 -# GRAPHRAG_LLM_TPM=0 -# GRAPHRAG_LLM_RPM=0 -# GRAPHRAG_LLM_MAX_RETRIES=10 -# GRAPHRAG_LLM_MAX_RETRY_WAIT=10 -# GRAPHRAG_LLM_SLEEP_ON_RATE_LIMIT_RECOMMENDATION=True - -# Text Embedding Settings -# GRAPHRAG_EMBEDDING_TYPE=openai_embedding -GRAPHRAG_EMBEDDING_API_KEY="your_api_key" # If GRAPHRAG_API_KEY is not set -GRAPHRAG_EMBEDDING_API_BASE="http://.openai.azure.com" # For Azure OpenAI Users and if GRAPHRAG_API_BASE is not set -GRAPHRAG_EMBEDDING_API_VERSION="api_version" # For Azure OpenAI Users and if GRAPHRAG_API_VERSION is not set -# GRAPHRAG_EMBEDDING_API_ORGANIZATION=None -# GRAPHRAG_EMBEDDING_API_PROXY=None -# GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME=None -# GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small -# GRAPHRAG_EMBEDDING_BATCH_SIZE=16 -# GRAPHRAG_EMBEDDING_BATCH_MAX_TOKENS=8191 -# GRAPHRAG_EMBEDDING_TARGET=required -# GRAPHRAG_EMBEDDING_SKIP=None -# GRAPHRAG_EMBEDDING_THREAD_COUNT=None -# GRAPHRAG_EMBEDDING_THREAD_STAGGER=50 -# GRAPHRAG_EMBEDDING_CONCURRENT_REQUESTS=25 -# GRAPHRAG_EMBEDDING_TPM=0 -# GRAPHRAG_EMBEDDING_RPM=0 -# GRAPHRAG_EMBEDDING_MAX_RETRIES=10 -# GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT=10 -# GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION=True - -# Data Mapping Settings -# GRAPHRAG_INPUT_ENCODING=utf-8 - -# Data Chunking -# GRAPHRAG_CHUNK_SIZE=1200 -# GRAPHRAG_CHUNK_OVERLAP=100 -# GRAPHRAG_CHUNK_BY_COLUMNS=id - -# Prompting Overrides -# GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE=None -# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=1 -# GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES=organization,person,event,geo -# GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE=None -# GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH=500 -# GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION="Any claims or facts that could be relevant to threat analysis." -# GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE=None -# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=1 -# GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE=None -# GRAPHRAG_COMMUNITY_REPORT_MAX_LENGTH=1500 - -# Storage -# GRAPHRAG_STORAGE_TYPE=file -# GRAPHRAG_STORAGE_CONNECTION_STRING=None -# GRAPHRAG_STORAGE_CONTAINER_NAME=None -# GRAPHRAG_STORAGE_BASE_DIR=None - -# Cache -# GRAPHRAG_CACHE_TYPE=file -# GRAPHRAG_CACHE_CONNECTION_STRING=None -# GRAPHRAG_CACHE_CONTAINER_NAME=None -# GRAPHRAG_CACHE_BASE_DIR=None - -# Reporting -# GRAPHRAG_REPORTING_TYPE=file -# GRAPHRAG_REPORTING_CONNECTION_STRING=None -# GRAPHRAG_REPORTING_CONTAINER_NAME=None -# GRAPHRAG_REPORTING_BASE_DIR=None - -# Node2Vec Parameters -# GRAPHRAG_NODE2VEC_ENABLED=False -# GRAPHRAG_NODE2VEC_NUM_WALKS=10 -# GRAPHRAG_NODE2VEC_WALK_LENGTH=40 -# GRAPHRAG_NODE2VEC_WINDOW_SIZE=2 -# GRAPHRAG_NODE2VEC_ITERATIONS=3 -# GRAPHRAG_NODE2VEC_RANDOM_SEED=597832 - -# Data Snapshotting -# GRAPHRAG_SNAPSHOT_GRAPHML=False -# GRAPHRAG_SNAPSHOT_RAW_ENTITIES=False -# GRAPHRAG_SNAPSHOT_TOP_LEVEL_NODES=False - -# Miscellaneous Settings -# GRAPHRAG_ASYNC_MODE=asyncio -# GRAPHRAG_ENCODING_MODEL=cl100k_base -# GRAPHRAG_MAX_CLUSTER_SIZE=10 -# GRAPHRAG_SKIP_WORKFLOWS=None -# GRAPHRAG_UMAP_ENABLED=False -``` diff --git a/docs/config/json_yaml.md b/docs/config/yaml.md similarity index 55% rename from docs/config/json_yaml.md rename to docs/config/yaml.md index ba3661308e..5b4012570c 100644 --- a/docs/config/json_yaml.md +++ b/docs/config/yaml.md @@ -1,47 +1,29 @@ -# Default Configuration Mode (using JSON/YAML) +# Default Configuration Mode (using YAML/JSON) -The default configuration mode may be configured by using a `settings.json` or `settings.yml` file in the data project root. 
If a `.env` file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using `${ENV_VAR}` syntax. +The default configuration mode may be configured by using a `settings.yml` or `settings.json` file in the data project root. If a `.env` file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using `${ENV_VAR}` syntax. We initialize with YAML by default in `graphrag init`, but you may use the equivalent JSON form if preferred. + +Many of these config values have defaults. Rather than replicate them here, please refer to the [constants in the code](https://github.com/microsoft/graphrag/blob/main/graphrag/config/defaults.py) directly. + +For example: ``` # .env -API_KEY=some_api_key - -# settings.json -{ - "llm": { - "api_key": "${API_KEY}" - } -} +GRAPHRAG_API_KEY=some_api_key + +# settings.yml +llm: + api_key: ${GRAPHRAG_API_KEY} ``` # Config Sections -## input - -### Fields - -- `type` **file|blob** - The input type to use. Default=`file` -- `file_type` **text|csv** - The type of input data to load. Either `text` or `csv`. Default is `text` -- `file_encoding` **str** - The encoding of the input file. Default is `utf-8` -- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$` if in csv mode and `.*\.txt$` if in text mode. -- `source_column` **str** - (CSV Mode Only) The source column name. -- `timestamp_column` **str** - (CSV Mode Only) The timestamp column name. -- `timestamp_format` **str** - (CSV Mode Only) The source format. -- `text_column` **str** - (CSV Mode Only) The text column name. -- `title_column` **str** - (CSV Mode Only) The title column name. -- `document_attribute_columns` **list[str]** - (CSV Mode Only) The additional document attributes to include. -- `connection_string` **str** - (blob only) The Azure Storage connection string. -- `container_name` **str** - (blob only) The Azure Storage container name. -- `base_dir` **str** - The base directory to read input from, relative to the root. -- `storage_account_blob_url` **str** - The storage account blob URL to use. +## Indexing -## llm +### llm This is the base LLM configuration section. Other steps may override this configuration with their own LLM configuration. -### Fields +#### Fields - `api_key` **str** - The OpenAI API key to use. - `type` **openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding** - The type of LLM to use. @@ -65,20 +47,20 @@ This is the base LLM configuration section. Other steps may override this config - `top_p` **float** - The top-p value to use. - `n` **int** - The number of completions to generate. -## parallelization +### parallelization -### Fields +#### Fields - `stagger` **float** - The threading stagger value. - `num_threads` **int** - The maximum number of work threads. -## async_mode +### async_mode **asyncio|threaded** The async mode to use. Either `asyncio` or `threaded`. -## embeddings +### embeddings -### Fields +#### Fields - `llm` (see LLM top-level config) - `parallelization` (see Parallelization top-level config) @@ -88,18 +70,38 @@ This is the base LLM configuration section. Other steps may override this config - `target` **required|all|none** - Determines which set of embeddings to emit. - `skip` **list[str]** - Which embeddings to skip. Only useful if target=all to customize the list. 
- `vector_store` **dict** - The vector store to use. Configured for lancedb by default. - - `type` **str** - `lancedb` or `azure_ai_search`. Default=`lancedb` - - `db_uri` **str** (only for lancedb) - The database uri. Default=`storage.base_dir/lancedb` - - `url` **str** (only for AI Search) - AI Search endpoint - - `api_key` **str** (optional - only for AI Search) - The AI Search api key to use. - - `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used. - - `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exist. Default=`True` - - `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default` + - `type` **str** - `lancedb` or `azure_ai_search`. Default=`lancedb` + - `db_uri` **str** (only for lancedb) - The database URI. Default=`storage.base_dir/lancedb` + - `url` **str** (only for AI Search) - AI Search endpoint + - `api_key` **str** (optional - only for AI Search) - The AI Search api key to use. + - `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used. + - `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True` + - `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default` - `strategy` **dict** - Fully override the text-embedding strategy. -## chunks +### input -### Fields +#### Fields + +- `type` **file|blob** - The input type to use. Default=`file` +- `file_type` **text|csv** - The type of input data to load. Either `text` or `csv`. Default is `text` +- `base_dir` **str** - The base directory to read input from, relative to the root. +- `connection_string` **str** - (blob only) The Azure Storage connection string. +- `storage_account_blob_url` **str** - The storage account blob URL to use. +- `container_name` **str** - (blob only) The Azure Storage container name. +- `file_encoding` **str** - The encoding of the input file. Default is `utf-8` +- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$` if in csv mode and `.*\.txt$` if in text mode. +- `file_filter` **dict** - Key/value pairs to filter. Default is None. +- `source_column` **str** - (CSV Mode Only) The source column name. +- `timestamp_column` **str** - (CSV Mode Only) The timestamp column name. +- `timestamp_format` **str** - (CSV Mode Only) The timestamp format. +- `text_column` **str** - (CSV Mode Only) The text column name. +- `title_column` **str** - (CSV Mode Only) The title column name. +- `document_attribute_columns` **list[str]** - (CSV Mode Only) The additional document attributes to include. + +### chunks + +#### Fields - `size` **int** - The max chunk size in tokens. - `overlap` **int** - The chunk overlap in tokens. @@ -107,9 +109,9 @@ This is the base LLM configuration section. Other steps may override this config - `encoding_model` **str** - The text encoding model to use. Default is to use the top-level encoding model. - `strategy` **dict** - Fully override the chunking strategy. -## cache +### cache -### Fields +#### Fields - `type` **file|memory|none|blob** - The cache type to use. Default=`file` - `connection_string` **str** - (blob only) The Azure Storage connection string. @@ -117,19 +119,29 @@ This is the base LLM configuration section. 
Other steps may override this config - `base_dir` **str** - The base directory to write cache to, relative to the root. - `storage_account_blob_url` **str** - The storage account blob URL to use. -## storage +### storage -### Fields +#### Fields - `type` **file|memory|blob** - The storage type to use. Default=`file` - `connection_string` **str** - (blob only) The Azure Storage connection string. - `container_name` **str** - (blob only) The Azure Storage container name. -- `base_dir` **str** - The base directory to write reports to, relative to the root. +- `base_dir` **str** - The base directory to write output artifacts to, relative to the root. - `storage_account_blob_url` **str** - The storage account blob URL to use. -## reporting +### update_index_storage -### Fields +#### Fields + +- `type` **file|memory|blob** - The storage type to use. Default=`file` +- `connection_string` **str** - (blob only) The Azure Storage connection string. +- `container_name` **str** - (blob only) The Azure Storage container name. +- `base_dir` **str** - The base directory to write output artifacts to, relative to the root. +- `storage_account_blob_url` **str** - The storage account blob URL to use. + +### reporting + +#### Fields - `type` **file|console|blob** - The reporting type to use. Default=`file` - `connection_string` **str** - (blob only) The Azure Storage connection string. @@ -137,9 +149,9 @@ This is the base LLM configuration section. Other steps may override this config - `base_dir` **str** - The base directory to write reports to, relative to the root. - `storage_account_blob_url` **str** - The storage account blob URL to use. -## entity_extraction +### entity_extraction -### Fields +#### Fields - `llm` (see LLM top-level config) - `parallelization` (see Parallelization top-level config) @@ -150,9 +162,9 @@ This is the base LLM configuration section. Other steps may override this config - `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model. - `strategy` **dict** - Fully override the entity extraction strategy. -## summarize_descriptions +### summarize_descriptions -### Fields +#### Fields - `llm` (see LLM top-level config) - `parallelization` (see Parallelization top-level config) @@ -161,11 +173,11 @@ This is the base LLM configuration section. Other steps may override this config - `max_length` **int** - The maximum number of output tokens per summarization. - `strategy` **dict** - Fully override the summarize description strategy. -## claim_extraction +### claim_extraction -### Fields +#### Fields -- `enabled` **bool** - Whether to enable claim extraction. default=False +- `enabled` **bool** - Whether to enable claim extraction. Off by default, because claim prompts really need user tuning. - `llm` (see LLM top-level config) - `parallelization` (see Parallelization top-level config) - `async_mode` (see Async Mode top-level config) @@ -175,9 +187,9 @@ This is the base LLM configuration section. Other steps may override this config - `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model. - `strategy` **dict** - Fully override the claim extraction strategy. -## community_reports +### community_reports -### Fields +#### Fields - `llm` (see LLM top-level config) - `parallelization` (see Parallelization top-level config) @@ -187,16 +199,16 @@ This is the base LLM configuration section. 
Other steps may override this config - `max_input_length` **int** - The maximum number of input tokens to use when generating reports. - `strategy` **dict** - Fully override the community reports strategy. -## cluster_graph +### cluster_graph -### Fields +#### Fields - `max_cluster_size` **int** - The maximum cluster size to emit. - `strategy` **dict** - Fully override the cluster_graph strategy. -## embed_graph +### embed_graph -### Fields +#### Fields - `enabled` **bool** - Whether to enable graph embeddings. - `num_walks` **int** - The node2vec number of walks. @@ -206,15 +218,15 @@ This is the base LLM configuration section. Other steps may override this config - `random_seed` **int** - The node2vec random seed. - `strategy` **dict** - Fully override the embed graph strategy. -## umap +### umap -### Fields +#### Fields - `enabled` **bool** - Whether to enable UMAP layouts. -## snapshots +### snapshots -### Fields +#### Fields - `embeddings` **bool** - Emit embeddings snapshots to parquet. - `graphml` **bool** - Emit graph snapshots to GraphML. @@ -222,10 +234,79 @@ This is the base LLM configuration section. Other steps may override this config - `top_level_nodes` **bool** - Emit top-level-node snapshots to JSON. - `transient` **bool** - Emit transient workflow tables snapshots to parquet. -## encoding_model +### encoding_model **str** - The text encoding model to use. Default=`cl100k_base`. -## skip_workflows +### skip_workflows **list[str]** - Which workflow names to skip. + +## Query + +### local_search + +#### Fields + +- `prompt` **str** - The prompt file to use. +- `text_unit_prop` **float** - The text unit proportion. +- `community_prop` **float** - The community proportion. +- `conversation_history_max_turns` **int** - The conversation history maximum turns. +- `top_k_entities` **int** - The top k mapped entities. +- `top_k_relationships` **int** - The top k mapped relationships. +- `temperature` **float | None** - The temperature to use for token generation. +- `top_p` **float | None** - The top-p value to use for token generation. +- `n` **int | None** - The number of completions to generate. +- `max_tokens` **int** - The maximum context size in tokens. +- `llm_max_tokens` **int** - The LLM maximum tokens. + +### global_search + +#### Fields + +- `map_prompt` **str | None** - The global search mapper prompt file to use. +- `reduce_prompt` **str | None** - The global search reducer prompt file to use. +- `knowledge_prompt` **str | None** - The global search general knowledge prompt file to use. +- `temperature` **float | None** - The temperature to use for token generation. +- `top_p` **float | None** - The top-p value to use for token generation. +- `n` **int | None** - The number of completions to generate. +- `max_tokens` **int** - The maximum context size in tokens. +- `data_max_tokens` **int** - The data llm maximum tokens. +- `map_max_tokens` **int** - The map llm maximum tokens. +- `reduce_max_tokens` **int** - The reduce llm maximum tokens. +- `concurrency` **int** - The number of concurrent requests. +- `dynamic_search_llm` **str** - LLM model to use for dynamic community selection. +- `dynamic_search_threshold` **int** - Rating threshold to include a community report. +- `dynamic_search_keep_parent` **bool** - Keep parent community if any of the child communities are relevant. 
+- `dynamic_search_num_repeats` **int** - Number of times to rate the same community report. +- `dynamic_search_use_summary` **bool** - Use community summary instead of full_context. +- `dynamic_search_concurrent_coroutines` **int** - Number of concurrent coroutines to rate community reports. +- `dynamic_search_max_level` **int** - The maximum level of community hierarchy to consider if none of the processed communities are relevant. + +### drift_search + +#### Fields + +- `prompt` **str** - The prompt file to use. +- `temperature` **float** - The temperature to use for token generation. +- `top_p` **float** - The top-p value to use for token generation. +- `n` **int** - The number of completions to generate. +- `max_tokens` **int** - The maximum context size in tokens. +- `data_max_tokens` **int** - The data llm maximum tokens. +- `concurrency` **int** - The number of concurrent requests. +- `drift_k_followups` **int** - The number of top global results to retrieve. +- `primer_folds` **int** - The number of folds for search priming. +- `primer_llm_max_tokens` **int** - The maximum number of tokens for the LLM in primer. +- `n_depth` **int** - The number of drift search steps to take. +- `local_search_text_unit_prop` **float** - The proportion of search dedicated to text units. +- `local_search_community_prop` **float** - The proportion of search dedicated to community properties. +- `local_search_top_k_mapped_entities` **int** - The number of top K entities to map during local search. +- `local_search_top_k_relationships` **int** - The number of top K relationships to map during local search. +- `local_search_max_data_tokens` **int** - The maximum context size in tokens for local search. +- `local_search_temperature` **float** - The temperature to use for token generation in local search. +- `local_search_top_p` **float** - The top-p value to use for token generation in local search. +- `local_search_n` **int** - The number of completions to generate in local search. +- `local_search_llm_max_gen_tokens` **int** - The maximum number of generated tokens for the LLM in local search. diff --git a/docs/get_started.md b/docs/get_started.md index 63f485a546..4eb2f2887f 100644 --- a/docs/get_started.md +++ b/docs/get_started.md @@ -34,7 +34,7 @@ The graphrag library includes a CLI for a no-code approach to getting started. P # Running the Indexer -Now we need to set up a data project and some initial configuration. Let's set that up. We're using the [default configuration mode](config/overview.md), which you can customize as needed using a [config file](config/json_yaml.md), which we recommend, or [environment variables](config/env_vars.md). +Now we need to set up a data project and some initial configuration. Let's set that up. We're using the [default configuration mode](config/overview.md), which you can customize as needed using a [config file](config/yaml.md), which we recommend, or [environment variables](config/env_vars.md). First let's get a sample dataset ready: diff --git a/docs/prompt_tuning/manual_prompt_tuning.md b/docs/prompt_tuning/manual_prompt_tuning.md index 7f10fc8e79..4dc90b893b 100644 --- a/docs/prompt_tuning/manual_prompt_tuning.md +++ b/docs/prompt_tuning/manual_prompt_tuning.md @@ -6,11 +6,13 @@ We provide a means for you to do this by allowing you to specify a custom prompt Each of these prompts may be overridden by writing a custom prompt file in plaintext. 
We use token-replacements in the form of `{token_name}`, and the descriptions for the available tokens can be found below. -## Entity/Relationship Extraction +## Indexing Prompts -[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/entity_extraction.py) +### Entity/Relationship Extraction -### Tokens (values provided by extractor) +[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/entity_extraction.py) + +#### Tokens - **{input_text}** - The input text to be processed. - **{entity_types}** - A list of entity types @@ -18,37 +20,71 @@ Each of these prompts may be overridden by writing a custom prompt file in plain - **{record_delimiter}** - A delimiter for separating tuple instances. - **{completion_delimiter}** - An indicator for when generation is complete. -## Summarize Entity/Relationship Descriptions +### Summarize Entity/Relationship Descriptions -[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/summarize_descriptions.py) +[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/summarize_descriptions.py) -### Tokens (values provided by extractor) +#### Tokens - **{entity_name}** - The name of the entity or the source/target pair of the relationship. - **{description_list}** - A list of descriptions for the entity or relationship. -## Claim Extraction +### Claim Extraction -[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/claim_extraction.py) +[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/claim_extraction.py) -### Tokens (values provided by extractor) +#### Tokens - **{input_text}** - The input text to be processed. - **{tuple_delimiter}** - A delimiter for separating values within a tuple. A single tuple is used to represent an individual entity or relationship. - **{record_delimiter}** - A delimiter for separating tuple instances. - **{completion_delimiter}** - An indicator for when generation is complete. - -Note: there is additional parameter for the `Claim Description` that is used in claim extraction. -The default value is - -`"Any claims or facts that could be relevant to information discovery."` +- **{entity_specs}** - A list of entity types. +- **{claim_description}** - Description of what claims should look like. Default is: `"Any claims or facts that could be relevant to information discovery."` See the [configuration documentation](../config/overview.md) for details on how to change this. -## Generate Community Reports +### Generate Community Reports -[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/community_report.py) +[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/community_report.py) -### Tokens (values provided by extractor) +#### Tokens - **{input_text}** - The input text to generate the report with. This will contain tables of entities and relationships. + +## Query Prompts + +### Local Search + +[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/query/local_search_system_prompt.py) + +#### Tokens + +- **{response_type}** - Describe how the response should look. We default to "multiple paragraphs". +- **{context_data}** - The data tables from GraphRAG's index. 
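For example, a hand-edited local search prompt can be swapped in from `settings.yml` (a sketch — the file path is illustrative):

```yaml
local_search:
  prompt: "prompts/my_local_search_prompt.txt" # a plaintext file using the tokens above
```
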
+ +### Global Search + +[Mapper Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/query/global_search_map_system_prompt.py) + +[Reducer Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/query/global_search_reduce_system_prompt.py) + +[Knowledge Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/query/global_search_knowledge_system_prompt.py) + +Global search uses a map/reduce approach to summarization. You can tune these prompts independently. This search also includes the ability to adjust the use of general knowledge from the model's training. + +#### Tokens + +- **{response_type}** - Describe how the response should look (reducer only). We default to "multiple paragraphs". +- **{context_data}** - The data tables from GraphRAG's index. + +### Drift Search + +[Prompt Source](http://github.com/microsoft/graphrag/blob/main/graphrag/prompts/query/drift_search_system_prompt.py) + +#### Tokens + +- **{response_type}** - Describe how the response should look. We default to "multiple paragraphs". +- **{context_data}** - The data tables from GraphRAG's index. +- **{community_reports}** - The most relevant community reports to include in the summarization. +- **{query}** - The query text as injected into the context. \ No newline at end of file diff --git a/docs/prompt_tuning/overview.md b/docs/prompt_tuning/overview.md index fe25a8eddf..a12cebb257 100644 --- a/docs/prompt_tuning/overview.md +++ b/docs/prompt_tuning/overview.md @@ -4,12 +4,7 @@ This page provides an overview of the prompt tuning options available for the Gr ## Default Prompts -The default prompts are the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. You can find more detail about these prompts in the following links: - -- [Entity/Relationship Extraction](http://github.com/microsoft/graphrag/blob/main/graphrag/index/graph/extractors/graph/prompts.py) - [Entity/Relationship Description Summarization](http://github.com/microsoft/graphrag/blob/main/graphrag/index/graph/extractors/summarize/prompts.py) - [Claim Extraction](http://github.com/microsoft/graphrag/blob/main/graphrag/index/graph/extractors/claims/prompts.py) - [Community Reports](http://github.com/microsoft/graphrag/blob/main/graphrag/index/graph/extractors/community_reports/prompts.py) +The default prompts are the simplest way to get started with the GraphRAG system. They are designed to work out-of-the-box with minimal configuration. More details about each of the default prompts for indexing and query can be found on the [manual tuning](./manual_prompt_tuning.md) page. ## Auto Tuning @@ -17,4 +12,4 @@ Auto Tuning leverages your input data and LLM interactions to create domain adap ## Manual Tuning -Manual tuning is an advanced use-case. Most users will want to use the Auto Tuning feature instead. Details about how to use manual configuration are available in the [Manual Tuning](manual_prompt_tuning.md) documentation. +Manual tuning is an advanced use-case. Most users will want to use the Auto Tuning feature instead. Details about how to use manual configuration are available in the [manual tuning](manual_prompt_tuning.md) documentation. 
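As a sketch, the query prompts above are wired up through the same `settings.yml` keys used at initialization (paths illustrative — these match the files generated by `graphrag init`):

```yaml
global_search:
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"

drift_search:
  prompt: "prompts/drift_search_system_prompt.txt"
```
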
diff --git a/docs/query/drift_search.md b/docs/query/drift_search.md index 1f72adf39f..e6199221ed 100644 --- a/docs/query/drift_search.md +++ b/docs/query/drift_search.md @@ -23,7 +23,7 @@ Below are the key parameters of the [DRIFTSearch class](https://github.com/micro - `llm`: OpenAI model object to be used for response generation - `context_builder`: [context builder](https://github.com/microsoft/graphrag/blob/main/graphrag/query/structured_search/drift_search/drift_context.py) object to be used for preparing context data from community reports and query information -- `config`: model to define the DRIFT Search hyperparameters. [DRIFT Config model](https://github.com/microsoft/graphrag/blob/main/graphrag/config/models/drift_config.py) +- `config`: model to define the DRIFT Search hyperparameters. [DRIFT Config model](https://github.com/microsoft/graphrag/blob/main/graphrag/config/models/drift_search_config.py) - `token_encoder`: token encoder for tracking the budget for the algorithm. - `query_state`: a state object as defined in [Query State](https://github.com/microsoft/graphrag/blob/main/graphrag/query/structured_search/drift_search/state.py) that allows to track execution of a DRIFT Search instance, alongside follow ups and [DRIFT actions](https://github.com/microsoft/graphrag/blob/main/graphrag/query/structured_search/drift_search/action.py). diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py index c807397799..46bf6167df 100644 --- a/graphrag/cli/initialize.py +++ b/graphrag/cli/initialize.py @@ -5,7 +5,7 @@ from pathlib import Path -from graphrag.index.init_content import INIT_DOTENV, INIT_YAML +from graphrag.config.init_content import INIT_DOTENV, INIT_YAML from graphrag.logging import ReporterType, create_progress_reporter from graphrag.prompts.index.claim_extraction import CLAIM_EXTRACTION_PROMPT from graphrag.prompts.index.community_report import ( diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index 262705b3dd..41ec8fc892 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -93,7 +93,7 @@ VECTOR_STORE = f""" type: {VectorStoreType.LanceDB.value} db_uri: '{(Path(STORAGE_BASE_DIR) / "lancedb")!s}' - container_name: default # A prefix for the vector store to create embedding containers. Default: 'default'. + container_name: default overwrite: true\ """ diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py new file mode 100644 index 0000000000..5056428dc6 --- /dev/null +++ b/graphrag/config/init_content.py @@ -0,0 +1,141 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Content for the init CLI command to generate a default configuration.""" + +import graphrag.config.defaults as defs + +INIT_YAML = f"""\ +### This config file contains required core defaults that must be set, along with a handful of common optional settings. +### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/ + +### LLM settings ### +## There are a number of settings to tune the threading and token limits for LLM calls - check the docs. + +encoding_model: cl100k_base # this needs to be matched to your model! + +llm: + api_key: ${{GRAPHRAG_API_KEY}} # set this in the generated .env file + type: {defs.LLM_TYPE.value} # or azure_openai_chat + model: {defs.LLM_MODEL} + model_supports_json: true # recommended if this is available for your model. 
+ # audience: "https://cognitiveservices.azure.com/.default" + # api_base: https://.openai.azure.com + # api_version: 2024-02-15-preview + # organization: + # deployment_name: + +parallelization: + stagger: {defs.PARALLELIZATION_STAGGER} + # num_threads: {defs.PARALLELIZATION_NUM_THREADS} + +async_mode: {defs.ASYNC_MODE.value} # or asyncio + +embeddings: + async_mode: {defs.ASYNC_MODE.value} # or asyncio + vector_store:{defs.VECTOR_STORE} + llm: + api_key: ${{GRAPHRAG_API_KEY}} + type: {defs.EMBEDDING_TYPE.value} # or azure_openai_embedding + model: {defs.EMBEDDING_MODEL} + # api_base: https://.openai.azure.com + # api_version: 2024-02-15-preview + # audience: "https://cognitiveservices.azure.com/.default" + # organization: + # deployment_name: + +### Input settings ### + +input: + type: {defs.INPUT_TYPE.value} # or blob + file_type: {defs.INPUT_FILE_TYPE.value} # or csv + base_dir: "{defs.INPUT_BASE_DIR}" + file_encoding: {defs.INPUT_FILE_ENCODING} + file_pattern: ".*\\\\.txt$" + +chunks: + size: {defs.CHUNK_SIZE} + overlap: {defs.CHUNK_OVERLAP} + group_by_columns: [{",".join(defs.CHUNK_GROUP_BY_COLUMNS)}] + +### Storage settings ### +## If blob storage is specified in the following four sections, +## connection_string and container_name must be provided + +cache: + type: {defs.CACHE_TYPE.value} # or blob + base_dir: "{defs.CACHE_BASE_DIR}" + +reporting: + type: {defs.REPORTING_TYPE.value} # or console, blob + base_dir: "{defs.REPORTING_BASE_DIR}" + +storage: + type: {defs.STORAGE_TYPE.value} # or blob + base_dir: "{defs.STORAGE_BASE_DIR}" + +## only turn this on if running `graphrag index` with custom settings +## we normally use `graphrag update` with the defaults +update_index_storage: + # type: {defs.STORAGE_TYPE.value} # or blob + # base_dir: "{defs.UPDATE_STORAGE_BASE_DIR}" + +### Workflow settings ### + +skip_workflows: [] + +entity_extraction: + prompt: "prompts/entity_extraction.txt" + entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}] + max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS} + +summarize_descriptions: + prompt: "prompts/summarize_descriptions.txt" + max_length: {defs.SUMMARIZE_DESCRIPTIONS_MAX_LENGTH} + +claim_extraction: + enabled: false + prompt: "prompts/claim_extraction.txt" + description: "{defs.CLAIM_DESCRIPTION}" + max_gleanings: {defs.CLAIM_MAX_GLEANINGS} + +community_reports: + prompt: "prompts/community_report.txt" + max_length: {defs.COMMUNITY_REPORT_MAX_LENGTH} + max_input_length: {defs.COMMUNITY_REPORT_MAX_INPUT_LENGTH} + +cluster_graph: + max_cluster_size: {defs.MAX_CLUSTER_SIZE} + +embed_graph: + enabled: false # if true, will generate node2vec embeddings for nodes + +umap: + enabled: false # if true, will generate UMAP embeddings for nodes + +snapshots: + graphml: false + raw_entities: false + top_level_nodes: false + embeddings: false + transient: false + +### Query settings ### +## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned. 
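+## (for example, local_search also accepts text_unit_prop, community_prop, top_k_entities, top_k_relationships, and max_tokens)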
+## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query + +local_search: + prompt: "prompts/local_search_system_prompt.txt" + +global_search: + map_prompt: "prompts/global_search_map_system_prompt.txt" + reduce_prompt: "prompts/global_search_reduce_system_prompt.txt" + knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt" + +drift_search: + prompt: "prompts/drift_search_system_prompt.txt" +""" + +INIT_DOTENV = """\ +GRAPHRAG_API_KEY= +""" diff --git a/graphrag/config/models/__init__.py b/graphrag/config/models/__init__.py index 68691cc6ee..887d4ad653 100644 --- a/graphrag/config/models/__init__.py +++ b/graphrag/config/models/__init__.py @@ -8,7 +8,7 @@ from .claim_extraction_config import ClaimExtractionConfig from .cluster_graph_config import ClusterGraphConfig from .community_reports_config import CommunityReportsConfig -from .drift_config import DRIFTSearchConfig +from .drift_search_config import DRIFTSearchConfig from .embed_graph_config import EmbedGraphConfig from .entity_extraction_config import EntityExtractionConfig from .global_search_config import GlobalSearchConfig diff --git a/graphrag/config/models/drift_config.py b/graphrag/config/models/drift_search_config.py similarity index 100% rename from graphrag/config/models/drift_config.py rename to graphrag/config/models/drift_search_config.py diff --git a/graphrag/config/models/graph_rag_config.py b/graphrag/config/models/graph_rag_config.py index 35912e12f5..adcf64452d 100644 --- a/graphrag/config/models/graph_rag_config.py +++ b/graphrag/config/models/graph_rag_config.py @@ -13,7 +13,7 @@ from .claim_extraction_config import ClaimExtractionConfig from .cluster_graph_config import ClusterGraphConfig from .community_reports_config import CommunityReportsConfig -from .drift_config import DRIFTSearchConfig +from .drift_search_config import DRIFTSearchConfig from .embed_graph_config import EmbedGraphConfig from .entity_extraction_config import EntityExtractionConfig from .global_search_config import GlobalSearchConfig diff --git a/graphrag/index/init_content.py b/graphrag/index/init_content.py deleted file mode 100644 index 93807672f2..0000000000 --- a/graphrag/index/init_content.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Content for the init CLI command to generate a default configuration.""" - -import graphrag.config.defaults as defs - -INIT_YAML = f"""\ -encoding_model: cl100k_base -skip_workflows: [] -llm: - api_key: ${{GRAPHRAG_API_KEY}} - type: {defs.LLM_TYPE.value} # or azure_openai_chat - model: {defs.LLM_MODEL} - model_supports_json: true # recommended if this is available for your model. 
- # audience: "https://cognitiveservices.azure.com/.default" - # max_tokens: {defs.LLM_MAX_TOKENS} - # request_timeout: {defs.LLM_REQUEST_TIMEOUT} - # api_base: https://.openai.azure.com - # api_version: 2024-02-15-preview - # organization: - # deployment_name: - # tokens_per_minute: 150_000 # set a leaky bucket throttle - # requests_per_minute: 10_000 # set a leaky bucket throttle - # max_retries: {defs.LLM_MAX_RETRIES} - # max_retry_wait: {defs.LLM_MAX_RETRY_WAIT} - # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times - # concurrent_requests: {defs.LLM_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made - # temperature: {defs.LLM_TEMPERATURE} # temperature for sampling - # top_p: {defs.LLM_TOP_P} # top-p sampling - # n: {defs.LLM_N} # Number of completions to generate - -parallelization: - stagger: {defs.PARALLELIZATION_STAGGER} - # num_threads: {defs.PARALLELIZATION_NUM_THREADS} # the number of threads to use for parallel processing - -async_mode: {defs.ASYNC_MODE.value} # or asyncio - -embeddings: - ## parallelization: override the global parallelization settings for embeddings - async_mode: {defs.ASYNC_MODE.value} # or asyncio - # target: {defs.EMBEDDING_TARGET.value} # or all - # batch_size: {defs.EMBEDDING_BATCH_SIZE} # the number of documents to send in a single request - # batch_max_tokens: {defs.EMBEDDING_BATCH_MAX_TOKENS} # the maximum number of tokens to send in a single request - vector_store:{defs.VECTOR_STORE} - # vector_store: # configuration for AI Search - # type: azure_ai_search - # url: - # api_key: # if not set, will attempt to use managed identity. Expects the `Search Index Data Contributor` RBAC role in this case. - # audience: # if using managed identity, the audience to use for the token - # overwrite: true # or false. Only applicable at index creation time - # container_name: default # A prefix for the AzureAISearch to create indexes. Default: 'default'. - llm: - api_key: ${{GRAPHRAG_API_KEY}} - type: {defs.EMBEDDING_TYPE.value} # or azure_openai_embedding - model: {defs.EMBEDDING_MODEL} - # api_base: https://.openai.azure.com - # api_version: 2024-02-15-preview - # audience: "https://cognitiveservices.azure.com/.default" - # organization: - # deployment_name: - # tokens_per_minute: 150_000 # set a leaky bucket throttle - # requests_per_minute: 10_000 # set a leaky bucket throttle - # max_retries: {defs.LLM_MAX_RETRIES} - # max_retry_wait: {defs.LLM_MAX_RETRY_WAIT} - # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times - # concurrent_requests: {defs.LLM_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made - -chunks: - size: {defs.CHUNK_SIZE} - overlap: {defs.CHUNK_OVERLAP} - group_by_columns: [{",".join(defs.CHUNK_GROUP_BY_COLUMNS)}] # by default, we don't allow chunks to cross documents - -input: - type: {defs.INPUT_TYPE.value} # or blob - file_type: {defs.INPUT_FILE_TYPE.value} # or csv - base_dir: "{defs.INPUT_BASE_DIR}" - file_encoding: {defs.INPUT_FILE_ENCODING} - file_pattern: ".*\\\\.txt$" - -cache: - type: {defs.CACHE_TYPE.value} # or blob - base_dir: "{defs.CACHE_BASE_DIR}" - # connection_string: - # container_name: - -storage: - type: {defs.STORAGE_TYPE.value} # or blob - base_dir: "{defs.STORAGE_BASE_DIR}" - # connection_string: - # container_name: - -update_index_storage: # Storage to save an updated index (for incremental indexing). 
Enabling this performs an incremental index run - # type: {defs.STORAGE_TYPE.value} # or blob - # base_dir: "{defs.UPDATE_STORAGE_BASE_DIR}" - # connection_string: - # container_name: - -reporting: - type: {defs.REPORTING_TYPE.value} # or console, blob - base_dir: "{defs.REPORTING_BASE_DIR}" - # connection_string: - # container_name: - -entity_extraction: - ## strategy: fully override the entity extraction strategy. - ## type: one of graph_intelligence, graph_intelligence_json and nltk - ## llm: override the global llm settings for this task - ## parallelization: override the global parallelization settings for this task - ## async_mode: override the global async_mode settings for this task - prompt: "prompts/entity_extraction.txt" - entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}] - max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS} - -summarize_descriptions: - ## llm: override the global llm settings for this task - ## parallelization: override the global parallelization settings for this task - ## async_mode: override the global async_mode settings for this task - prompt: "prompts/summarize_descriptions.txt" - max_length: {defs.SUMMARIZE_DESCRIPTIONS_MAX_LENGTH} - -claim_extraction: - ## llm: override the global llm settings for this task - ## parallelization: override the global parallelization settings for this task - ## async_mode: override the global async_mode settings for this task - # enabled: true - prompt: "prompts/claim_extraction.txt" - description: "{defs.CLAIM_DESCRIPTION}" - max_gleanings: {defs.CLAIM_MAX_GLEANINGS} - -community_reports: - ## llm: override the global llm settings for this task - ## parallelization: override the global parallelization settings for this task - ## async_mode: override the global async_mode settings for this task - prompt: "prompts/community_report.txt" - max_length: {defs.COMMUNITY_REPORT_MAX_LENGTH} - max_input_length: {defs.COMMUNITY_REPORT_MAX_INPUT_LENGTH} - -cluster_graph: - max_cluster_size: {defs.MAX_CLUSTER_SIZE} - -embed_graph: - enabled: false # if true, will generate node2vec embeddings for nodes - # num_walks: {defs.NODE2VEC_NUM_WALKS} - # walk_length: {defs.NODE2VEC_WALK_LENGTH} - # window_size: {defs.NODE2VEC_WINDOW_SIZE} - # iterations: {defs.NODE2VEC_ITERATIONS} - # random_seed: {defs.NODE2VEC_RANDOM_SEED} - -umap: - enabled: false # if true, will generate UMAP embeddings for nodes - -snapshots: - graphml: false - raw_entities: false - top_level_nodes: false - embeddings: false - transient: false - -local_search: - prompt: "prompts/local_search_system_prompt.txt" - # text_unit_prop: {defs.LOCAL_SEARCH_TEXT_UNIT_PROP} - # community_prop: {defs.LOCAL_SEARCH_COMMUNITY_PROP} - # conversation_history_max_turns: {defs.LOCAL_SEARCH_CONVERSATION_HISTORY_MAX_TURNS} - # top_k_mapped_entities: {defs.LOCAL_SEARCH_TOP_K_MAPPED_ENTITIES} - # top_k_relationships: {defs.LOCAL_SEARCH_TOP_K_RELATIONSHIPS} - # llm_temperature: {defs.LOCAL_SEARCH_LLM_TEMPERATURE} # temperature for sampling - # llm_top_p: {defs.LOCAL_SEARCH_LLM_TOP_P} # top-p sampling - # llm_n: {defs.LOCAL_SEARCH_LLM_N} # Number of completions to generate - # max_tokens: {defs.LOCAL_SEARCH_MAX_TOKENS} - -global_search: - map_prompt: "prompts/global_search_map_system_prompt.txt" - reduce_prompt: "prompts/global_search_reduce_system_prompt.txt" - knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt" - # llm_temperature: {defs.GLOBAL_SEARCH_LLM_TEMPERATURE} # temperature for sampling - # llm_top_p: {defs.GLOBAL_SEARCH_LLM_TOP_P} # top-p sampling - # 
llm_n: {defs.GLOBAL_SEARCH_LLM_N} # Number of completions to generate - # max_tokens: {defs.GLOBAL_SEARCH_MAX_TOKENS} - # data_max_tokens: {defs.GLOBAL_SEARCH_DATA_MAX_TOKENS} - # map_max_tokens: {defs.GLOBAL_SEARCH_MAP_MAX_TOKENS} - # reduce_max_tokens: {defs.GLOBAL_SEARCH_REDUCE_MAX_TOKENS} - # concurrency: {defs.GLOBAL_SEARCH_CONCURRENCY} - -drift_search: - prompt: "prompts/drift_search_system_prompt.txt" - # temperature: {defs.DRIFT_SEARCH_LLM_TEMPERATURE} - # top_p: {defs.DRIFT_SEARCH_LLM_TOP_P} - # n: {defs.DRIFT_SEARCH_LLM_N} - # max_tokens: {defs.DRIFT_SEARCH_MAX_TOKENS} - # data_max_tokens: {defs.DRIFT_SEARCH_DATA_MAX_TOKENS} - # concurrency: {defs.DRIFT_SEARCH_CONCURRENCY} - # drift_k_followups: {defs.DRIFT_SEARCH_K_FOLLOW_UPS} - # primer_folds: {defs.DRIFT_SEARCH_PRIMER_FOLDS} - # primer_llm_max_tokens: {defs.DRIFT_SEARCH_PRIMER_MAX_TOKENS} - # n_depth: {defs.DRIFT_N_DEPTH} - # local_search_text_unit_prop: {defs.DRIFT_LOCAL_SEARCH_TEXT_UNIT_PROP} - # local_search_community_prop: {defs.DRIFT_LOCAL_SEARCH_COMMUNITY_PROP} - # local_search_top_k_mapped_entities: {defs.DRIFT_LOCAL_SEARCH_TOP_K_MAPPED_ENTITIES} - # local_search_top_k_relationships: {defs.DRIFT_LOCAL_SEARCH_TOP_K_RELATIONSHIPS} - # local_search_max_data_tokens: {defs.DRIFT_LOCAL_SEARCH_MAX_TOKENS} - # local_search_temperature: {defs.DRIFT_LOCAL_SEARCH_LLM_TEMPERATURE} - # local_search_top_p: {defs.DRIFT_LOCAL_SEARCH_LLM_TOP_P} - # local_search_n: {defs.DRIFT_LOCAL_SEARCH_LLM_N} - # local_search_llm_max_gen_tokens: {defs.DRIFT_LOCAL_SEARCH_LLM_MAX_TOKENS} -""" - -INIT_DOTENV = """\ -GRAPHRAG_API_KEY= -""" diff --git a/graphrag/query/structured_search/drift_search/drift_context.py b/graphrag/query/structured_search/drift_search/drift_context.py index d2a271bdac..9da939d3c4 100644 --- a/graphrag/query/structured_search/drift_search/drift_context.py +++ b/graphrag/query/structured_search/drift_search/drift_context.py @@ -11,7 +11,7 @@ import pandas as pd import tiktoken -from graphrag.config.models.drift_config import DRIFTSearchConfig +from graphrag.config.models.drift_search_config import DRIFTSearchConfig from graphrag.model import ( CommunityReport, Covariate, diff --git a/graphrag/query/structured_search/drift_search/primer.py b/graphrag/query/structured_search/drift_search/primer.py index b3d2b26891..5d74ff8f0d 100644 --- a/graphrag/query/structured_search/drift_search/primer.py +++ b/graphrag/query/structured_search/drift_search/primer.py @@ -13,7 +13,7 @@ import tiktoken from tqdm.asyncio import tqdm_asyncio -from graphrag.config.models.drift_config import DRIFTSearchConfig +from graphrag.config.models.drift_search_config import DRIFTSearchConfig from graphrag.model import CommunityReport from graphrag.prompts.query.drift_search_system_prompt import ( DRIFT_PRIMER_PROMPT, diff --git a/graphrag/query/structured_search/drift_search/search.py b/graphrag/query/structured_search/drift_search/search.py index 936ea173c5..57ce4265b3 100644 --- a/graphrag/query/structured_search/drift_search/search.py +++ b/graphrag/query/structured_search/drift_search/search.py @@ -11,7 +11,7 @@ import tiktoken from tqdm.asyncio import tqdm_asyncio -from graphrag.config.models.drift_config import DRIFTSearchConfig +from graphrag.config.models.drift_search_config import DRIFTSearchConfig from graphrag.query.context_builder.conversation_history import ConversationHistory from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey from graphrag.query.llm.oai.chat_openai import ChatOpenAI diff --git a/mkdocs.yaml 
b/mkdocs.yaml index 0f8a0a794c..01e8a1e9e5 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -30,13 +30,6 @@ nav: - Architecture: "index/architecture.md" - Dataflow: "index/default_dataflow.md" - Outputs: "index/outputs.md" - - Configuration: - - Overview: "config/overview.md" - - Init Command: "config/init.md" - - Using Env Vars: "config/env_vars.md" - - Using JSON or YAML: "config/json_yaml.md" - - Fully Custom: "config/custom.md" - - Template: "config/template.md" - Prompt Tuning: - Overview: "prompt_tuning/overview.md" - Auto Tuning: "prompt_tuning/auto_prompt_tuning.md" @@ -52,9 +45,14 @@ nav: - Global Search: "examples_notebooks/global_search.ipynb" - Local Search: "examples_notebooks/local_search.ipynb" - DRIFT Search: "examples_notebooks/drift_search.ipynb" - - Microsoft Research Blog: "blog_posts.md" + - Configuration: + - Overview: "config/overview.md" + - Init Command: "config/init.md" + - Using YAML: "config/yaml.md" + - Using Env Vars: "config/env_vars.md" + - CLI: "cli.md" - Extras: - - CLI: "cli.md" + - Microsoft Research Blog: "blog_posts.md" - Visualization Guide: "visualization_guide.md" - Operation Dulce: - About: "data/operation_dulce/ABOUT.md" diff --git a/tests/unit/indexing/test_init_content.py b/tests/unit/indexing/test_init_content.py index eeb641cbf6..5001a2f9e5 100644 --- a/tests/unit/indexing/test_init_content.py +++ b/tests/unit/indexing/test_init_content.py @@ -10,7 +10,7 @@ GraphRagConfig, create_graphrag_config, ) -from graphrag.index.init_content import INIT_YAML +from graphrag.config.init_content import INIT_YAML def test_init_yaml(): diff --git a/v1-breaking-changes.md b/v1-breaking-changes.md index f289735579..a2ec67a5c7 100644 --- a/v1-breaking-changes.md +++ b/v1-breaking-changes.md @@ -54,4 +54,4 @@ reporting: base_dir: "output" # changed from "output/${timestamp}/reports" ``` -[Full docs on using JSON or YAML files for configuration](https://microsoft.github.io/graphrag/config/json_yaml/). +[Full docs on using YAML files for configuration](https://microsoft.github.io/graphrag/config/yaml/).
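For reference, the flattening shown for `reporting` above removes the `${timestamp}` layer from the output layout, so a migrated `settings.yml` typically pins static directories (a sketch, assuming the storage section follows the same pattern):

```yaml
storage:
  base_dir: "output" # changed from "output/${timestamp}/artifacts"

reporting:
  base_dir: "output" # changed from "output/${timestamp}/reports"
```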