implemented extract_directory_metadata to extract metadata from file …

…paths
databrickslabs · Nov 17, 2023 · 872b900 · 872b900
1 parent 92ca58e
commit 872b900
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -583,6 +583,7 @@ In this example, `srcDirectory` is the directory containing all intput documents
 
 The current implementation of doc splitters supports `SimpleNodeParser` of LlamaIndex, `RecursiveCharacterTextSplitter` of LangChain, and a custom tokenizer-based splitter (`TokenizerTextSpliter`). Like doc readers, the splitter classes can be extended by subclassing `DocSplitter`. Please note that the metadata extractor is supported for the `SimpleNodeParser`. An LLM instance needs to be created for the metadata extraction. The LLM definition needs to subclass `LLMDef` and override the `create` method. An example of an LLM definition can be found at: [LLM notebook](./notebooks/feature_factory_llms.py).
 
+Metadata of documents can be extracted using the metadata extractor of LlamaIndex. Feature Factory also provides a method to extract metadata from file paths. For example, if your documents are stored in per-year directories, you can extract the year as metadata if the directories are named `year=[actual year]`. Underscores and hyphens in the directory key are replaced with spaces. For instance, if your document has the path /tmp/year_of_publication=2023/doc1, then after splitting, each chunk from that document will have `year of publication: 2023` as part of the chunk header.
 
 
 ## Project Support

diff --git a/framework/feature_factory/llm_tools.py b/framework/feature_factory/llm_tools.py
@@ -12,7 +12,7 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from transformers import AutoTokenizer
 from langchain.document_loaders import UnstructuredPDFLoader
-import math
+import math, os, re
 
 
 class LLMTool(ABC):
@@ -115,6 +115,17 @@ def _to_lcdocuments(cls, docs: Union[str, List[Document], List[LCDocument]]):
                 new_docs.append(new_doc)
             return new_docs
 
+    @classmethod
+    def extract_directory_metadata(cls, fileName: str):
+        path_parts = os.path.normpath(fileName).split(os.path.sep)
+        attrs = {}
+        for part in path_parts:
+            if "=" in part:
+                attr, val = part.split('=')
+                if attr and val:
+                    attr = re.sub(r'[-_]', ' ', attr, flags=re.IGNORECASE)
+                    attrs[attr] = val
+        return attrs
 
     def apply(self, docs: Union[str, List[Document]]) -> List[str]:
         ...
@@ -204,6 +215,11 @@ def apply(self, docs: List[Document]):
         docs = DocSplitter._to_documents(docs)
         self.create()
         doc_nodes = self.node_parser.get_nodes_from_documents(docs)
+        for node in doc_nodes:
+            if 'file_path' in node.metadata:
+                filepath = node.metadata['file_path']
+                doc_attrs = DocSplitter.extract_directory_metadata(filepath)
+                node.metadata.update(doc_attrs)
         chunks = [node.get_content(metadata_mode=MetadataMode.LLM) for node in doc_nodes]
         return chunks
 

diff --git a/test/test_chunking.py b/test/test_chunking.py
@@ -130,3 +130,7 @@ class TestCatalog(LLMCatalogBase):
         assert llm_feature.name == "chunk_col_name"
         assert llm_feature.reader == TestCatalog.doc_reader
         assert llm_feature.splitter == TestCatalog.doc_splitter
+
+    def test_dir_meta_extraction(self):
+        attrs = DocSplitter.extract_directory_metadata("/tmp/year_of_publication=2023")
+        assert attrs["year of publication"] == "2023"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -583,6 +583,7 @@ In this example, `srcDirectory` is the directory containing all intput documents

		The current implementation of doc splitters supports `SimpleNodeParser` of LlamaIndex, `RecursiveCharacterTextSplitter` of LangChain, and a custom tokenizer-based splitter (`TokenizerTextSpliter`). Like doc readers, the splitter classes can be extended by subclassing `DocSplitter`. Please note that the metadata extractor is supported for the `SimpleNodeParser`. An LLM instance needs to be created for the metadata extraction. The LLM definition needs to subclass `LLMDef` and override the `create` method. An example of an LLM definition can be found at: [LLM notebook](./notebooks/feature_factory_llms.py).

		Metadata of documents can be extracted using the metadata extractor of LlamaIndex. Feature Factory also provides a method to extract metadata from file paths. For example, if your documents are stored in per-year directories, you can extract the year as metadata if the directories are named `year=[actual year]`. Underscores and hyphens in the directory key are replaced with spaces. For instance, if your document has the path /tmp/year_of_publication=2023/doc1, then after splitting, each chunk from that document will have `year of publication: 2023` as part of the chunk header.


		## Project Support
Expand Down