diff --git a/docs/source/users/index.md b/docs/source/users/index.md index b519f6ea4..fbaeeee73 100644 --- a/docs/source/users/index.md +++ b/docs/source/users/index.md @@ -492,6 +492,19 @@ use the `-c` or `--chunk-size` option and the `-o` or `--chunk-overlap` option. /learn --chunk-size 1000 --chunk-overlap 200 ``` +By default, `/learn` will not read directories named `node_modules`, `lib`, or `build`, +and will not read hidden files or hidden directories, where the file or directory name +starts with a `.`. To force `/learn` to read all supported file types in all directories, +use the `-a` or `--all` option. + +``` +# do not learn from hidden files, hidden directories, or node_modules, lib, or build directories +/learn + +# learn from all supported files +/learn -a +``` + ### Additional chat commands To clear the chat panel, use the `/clear` command. This does not reset the AI model; the model may still remember previous messages that you sent it, and it may use them to inform its responses. diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index 2d011e522..8fdffba66 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -38,6 +38,7 @@ def __init__( self.root_dir = root_dir self.dask_client_future = dask_client_future self.parser.prog = "/learn" + self.parser.add_argument("-a", "--all", action="store_true") self.parser.add_argument("-v", "--verbose", action="store_true") self.parser.add_argument("-d", "--delete", action="store_true") self.parser.add_argument("-l", "--list", action="store_true") @@ -115,7 +116,7 @@ async def _process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap) + await self.learn_dir(load_path, args.chunk_size, args.chunk_overlap, args.all) self.save() response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. @@ -132,7 +133,7 @@ def _build_list_response(self): {dir_list}""" return message - async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int): + async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int, all: bool): dask_client = await self.dask_client_future splitter_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap} splitters = { @@ -146,7 +147,7 @@ async def learn_dir(self, path: str, chunk_size: int, chunk_overlap: int): default_splitter=RecursiveCharacterTextSplitter(**splitter_kwargs), ) - delayed = split(path, splitter=splitter) + delayed = split(path, all, splitter=splitter) doc_chunks = await dask_client.compute(delayed) em_provider_cls, em_provider_args = self.get_embedding_provider() diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 0ce4bb739..ea4db8b42 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -17,14 +17,11 @@ def path_to_doc(path): metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix} return Document(page_content=text, metadata=metadata) - +# Unless /learn has the "all files" option passed in, files and directories beginning with '.' are excluded EXCLUDE_DIRS = { - ".ipynb_checkpoints", "node_modules", "lib", "build", - ".git", - ".DS_Store", } SUPPORTED_EXTS = { ".py", @@ -50,11 +47,15 @@ def flatten(*chunk_lists): return list(itertools.chain(*chunk_lists)) -def split(path, splitter): +def split(path, all: bool, splitter): chunks = [] for dir, _, filenames in os.walk(path): - if dir in EXCLUDE_DIRS: + if all is False and dir in EXCLUDE_DIRS: + continue + + # Exclude hidden directories + if all is False and dir[0] == '.': continue for filename in filenames: @@ -62,6 +63,10 @@ def split(path, splitter): if filepath.suffix not in SUPPORTED_EXTS: continue + # Unless we're learning "all" files, exclude hidden files + if all is False and filepath.name[0] == '.': + continue + document = dask.delayed(path_to_doc)(filepath) chunk = dask.delayed(split_document)(document, splitter) chunks.append(chunk)