From 3dc3ba7855f2cda28d3d58dc330f01f1641dd7e6 Mon Sep 17 00:00:00 2001 From: akshatgurbuxani Date: Sun, 3 Nov 2024 19:44:41 -0500 Subject: [PATCH] POC commit. Authored-by: Abhaya Shukla Co-authored-by: Akshat Gurbuxani Co-authored-by: Mounika Chowdary Akuraju Co-authored-by: Duoduo Xu --- project-poc/RAG_PolicyBot.ipynb | 4507 +++++++++++++++++++++++++++++++ project-poc/chunker.ipynb | 512 ++++ 2 files changed, 5019 insertions(+) create mode 100644 project-poc/RAG_PolicyBot.ipynb create mode 100644 project-poc/chunker.ipynb diff --git a/project-poc/RAG_PolicyBot.ipynb b/project-poc/RAG_PolicyBot.ipynb new file mode 100644 index 0000000..ea1e20e --- /dev/null +++ b/project-poc/RAG_PolicyBot.ipynb @@ -0,0 +1,4507 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "4RLTxvdfZy-k", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "fcedc899-7358-4cea-c3f0-389a575e3a1e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/480.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━\u001b[0m \u001b[32m368.6/480.6 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/116.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.5/27.5 MB\u001b[0m \u001b[31m57.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pypdf\n", + " Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)\n", + "Collecting langchain_community\n", + " Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)\n", + "Requirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.12.2)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (6.0.2)\n", + "Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain_community)\n", + " Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (3.10.10)\n", + "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)\n", + " Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)\n", + "Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)\n", + " Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n", + "Collecting langchain<0.4.0,>=0.3.6 (from langchain_community)\n", + " Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)\n", + "Collecting langchain-core<0.4.0,>=0.3.15 (from langchain_community)\n", + " Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)\n", + "Requirement already satisfied: langsmith<0.2.0,>=0.1.125 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (0.1.137)\n", + "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (1.26.4)\n", + "Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)\n", + " Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (9.0.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.17.0)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (4.0.3)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n", + " Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)\n", + "Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)\n", + "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.6->langchain_community) (0.3.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.6->langchain_community) (2.9.2)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.15->langchain_community) (1.33)\n", + "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.15->langchain_community) (24.1)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.125->langchain_community) (0.27.2)\n", + "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.125->langchain_community) (3.10.10)\n", + "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.125->langchain_community) (1.0.0)\n", + "Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)\n", + " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2024.8.30)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<2.0.36,>=1.4->langchain_community) (3.1.1)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain_community) (3.7.1)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain_community) (1.0.6)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain_community) (1.3.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain_community) (0.14.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.15->langchain_community) (3.0.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.6->langchain_community) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.6->langchain_community) (2.23.4)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp<4.0.0,>=3.8.3->langchain_community) (0.2.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain_community) (1.2.2)\n", + "Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_community-0.3.5-py3-none-any.whl (2.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m35.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)\n", + "Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n", + "Downloading langchain-0.3.7-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_core-0.3.15-py3-none-any.whl (408 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m408.7/408.7 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pydantic_settings-2.6.1-py3-none-any.whl (28 kB)\n", + "Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading marshmallow-3.23.1-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n", + "Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Installing collected packages: SQLAlchemy, python-dotenv, pypdf, mypy-extensions, marshmallow, httpx-sse, typing-inspect, pydantic-settings, dataclasses-json, langchain-core, langchain, langchain_community\n", + " Attempting uninstall: SQLAlchemy\n", + " Found existing installation: SQLAlchemy 2.0.36\n", + " Uninstalling SQLAlchemy-2.0.36:\n", + " Successfully uninstalled SQLAlchemy-2.0.36\n", + " Attempting uninstall: langchain-core\n", + " Found existing installation: langchain-core 0.3.13\n", + " Uninstalling langchain-core-0.3.13:\n", + " Successfully uninstalled langchain-core-0.3.13\n", + " Attempting uninstall: langchain\n", + " Found existing installation: langchain 0.3.4\n", + " Uninstalling langchain-0.3.4:\n", + " Successfully uninstalled langchain-0.3.4\n", + "Successfully installed SQLAlchemy-2.0.35 dataclasses-json-0.6.7 httpx-sse-0.4.0 langchain-0.3.7 langchain-core-0.3.15 langchain_community-0.3.5 marshmallow-3.23.1 mypy-extensions-1.0.0 pydantic-settings-2.6.1 pypdf-5.1.0 python-dotenv-1.0.1 typing-inspect-0.9.0\n", + "Requirement already satisfied: langchain-community in /usr/local/lib/python3.10/dist-packages (0.3.5)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.2)\n", + "Requirement already satisfied: SQLAlchemy<2.0.36,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.35)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.10.10)\n", + "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.6.7)\n", + "Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.4.0)\n", + "Requirement already satisfied: langchain<0.4.0,>=0.3.6 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.3.7)\n", + "Requirement already satisfied: langchain-core<0.4.0,>=0.3.15 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.3.15)\n", + "Requirement already satisfied: langsmith<0.2.0,>=0.1.125 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.1.137)\n", + "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.26.4)\n", + "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.6.1)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (9.0.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.17.0)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n", + "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.23.1)\n", + "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n", + "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.6->langchain-community) (0.3.0)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.6->langchain-community) (2.9.2)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.15->langchain-community) (1.33)\n", + "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.15->langchain-community) (24.1)\n", + "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.15->langchain-community) (4.12.2)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.125->langchain-community) (0.27.2)\n", + "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.125->langchain-community) (3.10.10)\n", + "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.125->langchain-community) (1.0.0)\n", + "Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.0.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.8.30)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<2.0.36,>=1.4->langchain-community) (3.1.1)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (3.7.1)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (1.0.6)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (1.3.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (0.14.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.15->langchain-community) (3.0.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.6->langchain-community) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.6->langchain-community) (2.23.4)\n", + "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp<4.0.0,>=3.8.3->langchain-community) (0.2.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (1.2.2)\n", + "Requirement already satisfied: langchain in /usr/local/lib/python3.10/dist-packages (0.3.7)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", + "Collecting transformers\n", + " Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.1/44.1 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.2)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.35)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.10.10)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", + "Requirement already satisfied: langchain-core<0.4.0,>=0.3.15 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.3.15)\n", + "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.3.0)\n", + "Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in /usr/local/lib/python3.10/dist-packages (from langchain) (0.1.137)\n", + "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.26.4)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.9.2)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (9.0.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.24.7)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", + "Collecting tokenizers<0.21,>=0.20 (from transformers)\n", + " Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.17.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.9.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.15->langchain) (1.33)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.17->langchain) (0.27.2)\n", + "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.10)\n", + "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.17->langchain) (1.0.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.23.4)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.8.30)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain) (3.7.1)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain) (1.0.6)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain) (1.3.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain) (0.14.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.15->langchain) (3.0.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp<4.0.0,>=3.8.3->langchain) (0.2.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain) (1.2.2)\n", + "Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.0/10.0 MB\u001b[0m \u001b[31m65.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: tokenizers, transformers\n", + " Attempting uninstall: tokenizers\n", + " Found existing installation: tokenizers 0.19.1\n", + " Uninstalling tokenizers-0.19.1:\n", + " Successfully uninstalled tokenizers-0.19.1\n", + " Attempting uninstall: transformers\n", + " Found existing installation: transformers 4.44.2\n", + " Uninstalling transformers-4.44.2:\n", + " Successfully uninstalled transformers-4.44.2\n", + "Successfully installed tokenizers-0.20.1 transformers-4.46.1\n" + ] + } + ], + "source": [ + "!pip install -q langchain\n", + "!pip install -q torch\n", + "!pip install -q transformers\n", + "!pip install -q sentence-transformers\n", + "!pip install -q datasets\n", + "!pip install -q faiss-cpu\n", + "!pip install pypdf langchain_community\n", + "!pip install -U langchain-community\n", + "!pip install -U langchain transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "_SXTUprdZ5mD" + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import HuggingFaceDatasetLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from langchain.vectorstores import FAISS\n", + "from transformers import AutoTokenizer, AutoModelForQuestionAnswering\n", + "from transformers import AutoTokenizer, pipeline\n", + "from langchain import HuggingFacePipeline\n", + "from langchain.chains import RetrievalQA" + ] + }, + { + "cell_type": "code", + "source": [ + "from langchain.schema import Document\n", + "import json\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# Path to the JSON file containing chunked data\n", + "json_file_path = '/content/drive/MyDrive/Policy_Bot_Data/Json/chunked_data_all_folders_cleaned.json'\n", + "\n", + "# Load the JSON data\n", + "with open(json_file_path, 'r') as json_file:\n", + " chunked_data = json.load(json_file)\n", + "\n", + "# Initialize list to store documents\n", + "documents = []\n", + "\n", + "# Process each entry in the JSON data\n", + "for entry in chunked_data:\n", + " # Extract fields from JSON entry\n", + " original_content = entry['content']\n", + " folder_name = entry['folder_name']\n", + " file_name = entry['file_name']\n", + "\n", + " # Create Document objects for each entry with metadata\n", + " doc = Document(\n", + " page_content=original_content,\n", + " metadata={\n", + " 'folder_name': folder_name,\n", + " 'file_name': file_name\n", + " }\n", + " )\n", + " documents.append(doc)\n", + "\n", + "# # Check the first two documents as an example\n", + "# for doc in documents[:2]:\n", + "# print(doc.metadata)\n", + "# print(doc.page_content)\n", + "# print('-' * 40)\n", + "\n", + "print(len(documents))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MZLxzo7k93Z5", + "outputId": "fcf34075-b8f0-489d-f849-af238e2bb4d6" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "4881\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "QyFo0yIciR_s", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 528, + "referenced_widgets": [ + "5602993b471c4263b91e155d1c790ad2", + "6457452f5b274d26acf0a37dcb9eff2b", + "1e76f026d8a64b3b97766488ba8c71f3", + "b1bcf2e3e3a54d9d8563c831abd71ddc", + "9d55b4a255074969a560a338f8262599", + "624e4733f6944e4392c9b466e8fc606c", + "ac54f91783b845bfa8cc7aa84d4f620f", + "da20b8e4285f48bb980e4aaca8694335", + "9ae3513da4624edc8e1c9735ea8c16b0", + "e603c7b8b8204e14aa59c2fe0dc50341", + "5765e6ce65454e38ae07afc9275eb7d8", + "db0b24ed7c27453a831a51fdf0707490", + "ed889b184f7e450799f6588eb53bb841", + "5dfb14b250824dca9e5ffcb53626ed54", + "aae698ce1a2f4f29b2d193c1c4977872", + "59fdef2a83ed485ead4b56e4748ffb42", + "8d550181e93f4c37b41173182345ae5a", + "c5a73a777d7641fe8f027c1b2c87ac1f", + "f16847bb6ed54fccaabcf452b5d1e5eb", + "3f5fa86d34d44fd6a23ac974e0583e99", + "6cd42d0f19164b8c891f4373e589c55d", + "38dbbb5629ed4b88abc5db9d77cef68e", + "0bb2a38daa734996aaba1cf69c465a1e", + "a323cb24135941acaae33f529fee0cb1", + "1d4ab49794444a7ea3b8edb8ccdbeb17", + "7bc121102f16445a9e1b94626b901c77", + "ebb4dbd9c6104ca2ad38e136beb00c9b", + "7699f1001d244bf4b805804e61bded2b", + "55d0ed4cf8034df3821a495961ca1215", + "a44592c16a6047a19cddd622a9dbde68", + "abeb90ec95964e68be21a68df2558d5e", + "5d377cd952df42dfaf7a630f02b8d9ac", + "bc2f54c8f0c145d8bbdab6abd1e9588b", + "91f5c87bc71e4451aa257700afa49467", + "03ef72ed850a4bf38d413d8930cbd209", + "00d9c5b144054262a898122e8422a336", + "7a5845987f65459885632be72a8d310f", + "23aaa3472ad5494f89af9d163418a1eb", + "5969da74bb804f379dab5796266ffe3a", + "bcb3a5a8ce5e4de888408735736264e0", + "16f7e0a07e4c4a4490f5b00252d46d70", + "af0313eb5dbf4b8b8742087c4bb45687", + "87d5ed8049844da8a85f94e299cfb3f7", + "2f747b9059804fc9818a08633bd49b3b", + "a612bdb031d94208bad491801b3093af", + "beefbe40d96b42fc8e09999fdbf4bc46", + "d00465c8e95b435696e76f970de16e33", + "699eaf4cae3d4a55af0f2bac134c87dc", + "219242eeefdd4cd88809dbcd8cdbdfa4", + "145bec2795624519ae93107f814af67b", + "f661dcec60ab461cb6040523cbbb0eed", + "053857822fc442a490b0780c5cec4fa8", + "575206fd90a1487f8183c9199e64f562", + "5887fbcd1c784892a904b683a3d03f19", + "981c1024e69344e981556e830b397276", + "8270ddb5defe4d119e442a00d0a05c85", + "cd1f47bb84ac409ca63611c45d0f0ec1", + "90980c94b3884b76b8da7c7c0cfb2191", + "99c7a245aab34a049a985fc97734cd5f", + "28cd93f930f040cd8d5f0d2cd608d133", + "07921b32eb9a4122be22914fc16d9203", + "26c2f94d3c274d93ae497506380b0fbc", + "a123f66828344557911cd7c224bc1414", + "783b0b1c77634bd5a9ed965e5f059aca", + "fd5843741b91489996b4ef82fa72683b", + "dc926d58d55347ad83ffb2de297eb111", + "378d154a0c20476a945564caa58b212f", + "3f177ec1d48f4706926c3b4a7dc8a26b", + "540e9d8ae0fa4886a201ed7005421ad4", + "5f5d871d535c4c50b09b4f2a6f152f14", + "4a24827b4da24c6299e08062b09fe543", + "0fb58f73df7d4b029651343fcf600cb4", + "e9a43c2b80694e49bd6e5828d919aad9", + "d0da86fb33224458aafb890174178f4f", + "61c5126087ee4aee86b742b0a8f6f979", + "8e6e6d73a9c74a529c1da7776a0cfa21", + "35333f2db3b04f048647dd0484b752db", + "5ee644d1ba6d4289b0fa9e85444b8322", + "44f7221e77eb4b2793d929f0a85119d0", + "a499aee201b84016ae690ba6f6891af4", + "018e11f698bd4450abc4fb0d6e8154be", + "2489c39ce41a4eccab75750a03f68b4b", + "30dcea9ce75a4736935d9c2ba1473ac1", + "92d7da101a824e0486fc2548f2ebb8eb", + "4f11b5851dfd421f991995e71971fe7b", + "9e8df063e33c4e00a09cffd791090500", + "2ed2b35398a24a618f58a2c7156a8dba", + "b5e805ad129d4cbb96bea7195adc7f01", + "f7946be6f4f04d6d8512b4f1f024b0c4", + "afc4075efc2b4f658ea79bcd7a387718", + "0277ad263ed5416f8eed03644ea51160", + "9ac629c323a448f7ad7496473104435e", + "158e432baa5545efa6a7b29b8d101047", + "12f3879713804d589e90a31df2e7b7e1", + "7744cee6e8c44d5782c8238e25fa5378", + "1fb10fe13fb041fb91b30520b7ae40c7", + "bff00b478d2943e6bf4bb4ca73ef8622", + "e82887df33474151938310cefd2ffb89", + "71684282b92b4f748736f06116c62150", + "e24d316b62bf416a88cb9b04df7be3a2", + "982b991196ff4b3f8a5e3389f69933d9", + "e451648961ae41c19ac6ba460c88285e", + "f876ce779d5141e1b300eff0cfedadae", + "d65148a4aca548f0acf6a8eccd71de53", + "d0c5716d6ede44dfb13e437179d81ddd", + "25aef018b3934bf1bf52fb987396807e", + "ba7fafbbb64740b7b4954504719cd93c", + "ad039b6e107542f389d62d78525fa836", + "7a311564af194a5c960116887a787521", + "711c79d2746e4f0f849d294ac24279f4", + "771a5c9d8e3546649388c16952590822", + "b1cc873e347348c38d7d87eecbb5a196", + "c6b0560e7b4548779686bb3da56dde31", + "a1ad3702ce41453d9da2c4270c52f44d", + "524cf5dac04a4107a454b433639ef841", + "3fc5d0342b8d4337bf0ac5370818f085", + "407e75ec746741bebc87e8c4e8ea5152", + "ae66d90c2fd44171a1fe7b6219a3f4b2", + "4a412541f969463f8cc81e8e83e7c71d", + "f008b8564f524e0bb1da660c743bad23", + "cfc5ef434cf848619372dc1e4401c029" + ] + }, + "outputId": "3e504f59-2681-41c7-cc2d-9b27ae2c38d0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":11: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "modules.json: 0%| | 0.00/349 [00:00=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.12.2)\n", + "Downloading pypdf-5.0.1-py3-none-any.whl (294 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/294.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━\u001b[0m \u001b[32m286.7/294.5 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.5/294.5 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pypdf\n", + "Successfully installed pypdf-5.0.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install -U langchain-community" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CbYZ89c9fB7_", + "outputId": "383703f2-8d78-40d1-dd5d-46aa2b870be9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting langchain-community\n", + " Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.2)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.36)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.10.10)\n", + "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)\n", + " Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)\n", + "Collecting langchain<0.4.0,>=0.3.4 (from langchain-community)\n", + " Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)\n", + "Collecting langchain-core<0.4.0,>=0.3.12 (from langchain-community)\n", + " Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)\n", + "Collecting langsmith<0.2.0,>=0.1.125 (from langchain-community)\n", + " Downloading langsmith-0.1.137-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.26.4)\n", + "Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)\n", + " Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.32.3)\n", + "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (9.0.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.16.0)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)\n", + " Downloading marshmallow-3.23.0-py3-none-any.whl.metadata (7.6 kB)\n", + "Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)\n", + "Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain<0.4.0,>=0.3.4->langchain-community)\n", + " Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)\n", + "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.4->langchain-community) (2.9.2)\n", + "Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.12->langchain-community)\n", + " Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)\n", + "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.12->langchain-community) (24.1)\n", + "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.12->langchain-community) (4.12.2)\n", + "Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.125->langchain-community)\n", + " Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n", + "Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.125->langchain-community)\n", + " Downloading orjson-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.6/50.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting requests-toolbelt<2.0.0,>=1.0.0 (from langsmith<0.2.0,>=0.1.125->langchain-community)\n", + " Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)\n", + "Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)\n", + " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.8.30)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.1.1)\n", + "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (3.7.1)\n", + "Collecting httpcore==1.* (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community)\n", + " Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (1.3.1)\n", + "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community)\n", + " Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n", + "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.12->langchain-community)\n", + " Downloading jsonpointer-3.0.0-py2.py3-none-any.whl.metadata (2.3 kB)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.4->langchain-community) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.4->langchain-community) (2.23.4)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp<4.0.0,>=3.8.3->langchain-community) (0.2.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.125->langchain-community) (1.2.2)\n", + "Downloading langchain_community-0.3.3-py3-none-any.whl (2.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)\n", + "Downloading langchain-0.3.4-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langchain_core-0.3.12-py3-none-any.whl (407 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m407.7/407.7 kB\u001b[0m \u001b[31m24.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading langsmith-0.1.137-py3-none-any.whl (296 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m296.9/296.9 kB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pydantic_settings-2.6.0-py3-none-any.whl (28 kB)\n", + "Downloading httpx-0.27.2-py3-none-any.whl (76 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", + "Downloading langchain_text_splitters-0.3.0-py3-none-any.whl (25 kB)\n", + "Downloading marshmallow-3.23.0-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading orjson-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.5/144.5 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n", + "Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Downloading jsonpointer-3.0.0-py2.py3-none-any.whl (7.6 kB)\n", + "Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: python-dotenv, orjson, mypy-extensions, marshmallow, jsonpointer, h11, typing-inspect, requests-toolbelt, jsonpatch, httpcore, pydantic-settings, httpx, dataclasses-json, langsmith, langchain-core, langchain-text-splitters, langchain, langchain-community\n", + "Successfully installed dataclasses-json-0.6.7 h11-0.14.0 httpcore-1.0.6 httpx-0.27.2 jsonpatch-1.33 jsonpointer-3.0.0 langchain-0.3.4 langchain-community-0.3.3 langchain-core-0.3.12 langchain-text-splitters-0.3.0 langsmith-0.1.137 marshmallow-3.23.0 mypy-extensions-1.0.0 orjson-3.10.10 pydantic-settings-2.6.0 python-dotenv-1.0.1 requests-toolbelt-1.0.0 typing-inspect-0.9.0\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import json\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter" + ], + "metadata": { + "id": "3k3HV15We-jE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import json\n", + "import re\n", + "\n", + "# Path to the main dataset directory\n", + "dataset_path = '/content/drive/MyDrive/Policy_Bot_Data/dataset'\n", + "\n", + "# Initialize list to store all chunks\n", + "all_chunks = []\n", + "folder_count = 0\n", + "pdf_count = 0\n", + "\n", + "# Function to clean folder and file names\n", + "def clean_name(name):\n", + " # Remove parentheses and their content\n", + " name = re.sub(r'\\s*\\([^)]*\\)', '', name)\n", + " # Remove .pdf extension\n", + " name = re.sub(r'\\.pdf$', '', name, flags=re.IGNORECASE)\n", + " return name.strip()\n", + "\n", + "# Function to process PDF\n", + "def process_pdf(pdf_path, folder_name):\n", + " global pdf_count\n", + " try:\n", + " # Loading pdf\n", + " loader = PyPDFLoader(file_path=pdf_path)\n", + " docs_before_split = loader.load()\n", + "\n", + " # Check if the document was loaded properly\n", + " if len(docs_before_split) == 0:\n", + " print(f\"Warning: No content found in {pdf_path}\")\n", + " return\n", + "\n", + " # Initialize the text splitter\n", + " text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=700,\n", + " chunk_overlap=50,\n", + " )\n", + "\n", + " # Split the documents into chunks\n", + " docs_after_split = text_splitter.split_documents(docs_before_split)\n", + "\n", + " # Check if the document was split correctly\n", + " if len(docs_after_split) == 0:\n", + " print(f\"Warning: No chunks created from {pdf_path}\")\n", + " return\n", + "\n", + " # Clean the folder and file names\n", + " clean_folder_name = clean_name(folder_name)\n", + " clean_file_name = clean_name(os.path.basename(pdf_path))\n", + "\n", + " # Prepare the chunks in a dictionary format, including the cleaned folder and file name\n", + " for i, doc in enumerate(docs_after_split):\n", + " chunk_data = {\n", + " 'folder_name': clean_folder_name,\n", + " 'file_name': clean_file_name,\n", + " 'chunk_id': i + 1,\n", + " 'content': doc.page_content\n", + " }\n", + " all_chunks.append(chunk_data)\n", + " pdf_count += 1\n", + " print(f\"Processed {pdf_path}, created {len(docs_after_split)} chunks.\")\n", + " except Exception as e:\n", + " print(f\"Error processing {pdf_path}: {str(e)}\")\n", + "\n", + "# Walk through the main dataset directory and process all PDF files\n", + "for root, dirs, files in os.walk(dataset_path):\n", + " if files:\n", + " folder_count += 1 # Increment folder count when files are present\n", + " for file_name in files:\n", + " if file_name.endswith('.pdf'):\n", + " pdf_path = os.path.join(root, file_name)\n", + " folder_name = os.path.basename(root) # Get the folder name for indexing\n", + " process_pdf(pdf_path, folder_name)\n", + "\n", + "# Check if there are any chunks to save\n", + "if all_chunks:\n", + " # Save all chunks to a JSON file\n", + " output_path = '/content/chunked_data_all_folders_cleaned.json'\n", + " with open(output_path, 'w') as json_file:\n", + " json.dump(all_chunks, json_file, indent=4)\n", + " print(f\"All PDF chunks have been saved to {output_path}\")\n", + "else:\n", + " print(\"No chunks were created. Please check the input files.\")\n", + "\n", + "# Print the number of folders traversed and the number of PDFs processed\n", + "print(f\"Number of folders traversed: {folder_count}\")\n", + "print(f\"Number of PDFs processed: {pdf_count}\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RIuO3bM3gHd_", + "outputId": "41ef963e-d5ea-4312-fccd-177193e19469" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-01 Wellness Policy .pdf, created 220 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-02 Phys Ed & Physical Activity.pdf, created 45 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-03 Comprehensive Health Ed.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-04 Healthy School Environment Policy.pdf, created 19 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-06 Tobacco-Nicotine Policy.pdf, created 35 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-01 Exam School Application and Admissions.pdf, created 38 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-03 DYS Committed Students.pdf, created 26 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-04 Grade Requirements.pdf, created 16 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-05 Maximum Age Assignment and Enrollment Policy.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-06 Voluntary Transfer Policy.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-07 Safety Transfer Request.pdf, created 28 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-01 School Safety Contingency Plans.pdf, created 95 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-02 Fire Safety Practices.pdf, created 39 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-03 Building Codes & Fire Regulations.pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-04 Bomb Threat Procedures.pdf, created 20 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-05 Medical Emergency Management.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-06 Student Safety_Health in School Shops, Labs & Classrooms.pdf, created 41 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-07 Public Health & Workplace Safety.pdf, created 20 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/FSE-08 Safe Mode and Internal Threat Procedures .pdf, created 32 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Fire Safety & Emergency Management (FSE)/HRS-HS02 Job Sharing for Permanent Teachers and Paras.pdf, created 13 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-01 Travel Policy.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-02a Mileage Reimbursement.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-02b Mileage Reimbursement_January-June.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-03 Expenditure Reimbursement.pdf, created 12 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-04 Student Activity Accounts.pdf, created 28 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-07 Purchasing Guidelines.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-09 Private Contributions Management Guidelines.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-10 Grants Guidelines.pdf, created 36 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-11 Scholarships, Awards, and Honors for Students.pdf, created 30 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-12 Trust Funds for Schools.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-14 Overpayment of Salaries.pdf, created 4 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-16 Budget Transfers.pdf, created 12 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-19 BPS Postage & Printing Policy.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-20 Managing Stipends.pdf, created 29 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Finance (FIN)/FIN-21 BPS-Recognized Independent 501c3s.pdf, created 7 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Instructional & Information Technology (OIIT)/OIIT-01 Acceptable Use Policy.pdf, created 35 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Instructional & Information Technology (OIIT)/OIIT-02 Procuring Digital Products Guidance Document.pdf, created 11 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Instructional & Information Technology (OIIT)/OIIT-03 Technology Purchasing, Acquisition & Return Guide.pdf, created 11 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-01 Nondiscrimination and Policy Statement.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-02 Bias-Based Conduct Toward Students, Families, or Other Third Parties.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-03 Sexual Misconduct Toward Students.pdf, created 36 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-04 Students and Gender Identity.pdf, created 21 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-05 Bias-Based Conduct Toward Employees.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-06 Sexual Misconduct Toward Employees and Third Parties.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-07 Accommodating Employees.pdf, created 15 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-08 Expectant & Parenting Students.pdf, created 76 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-09 Transgender and Gender Nonconforming Employees.pdf, created 19 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Equity (EQT)/EQT-10 Opportunity & Achievement Gaps Policy Implementation.pdf, created 23 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Food and Nutrition Services (FNS)/FNS-02 Emergency Meal Procedures.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Food and Nutrition Services (FNS)/FNS-03 Competitive Foods Guidelines.pdf, created 52 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Food and Nutrition Services (FNS)/FNS-04 Responsibilities4 Regarding School Food Services.pdf, created 25 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Food and Nutrition Services (FNS)/FNS-06 Menu Standards and Guidelines.pdf, created 37 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Community Partners (SCP)/SCP-01 School-Community Partners.pdf, created 12 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-01 Drug and Alcohol Abuse.pdf, created 20 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-04 Infection Prevention Control.pdf, created 35 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-05 Tuberculosis Program.pdf, created 4 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-06 Immunization Law.pdf, created 16 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-08 Medication Administration.pdf, created 38 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-11 Life Threatening Allergies.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-13 Transportation Medical Accommodation.pdf, created 23 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-16 Suicide Prevention & Intervention.pdf, created 36 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-20 Asthma in Schools.pdf, created 20 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-21 Diabetes Policy.pdf, created 34 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-22 Automatic External Defibrillator Policy.pdf, created 45 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-23 Condom Accessibility.pdf, created 21 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-24 Diapering and Toileting Accidents Policy.pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-25 Sickle Cell Disease Policy.pdf, created 30 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/School Health Services (SHS)/SHS-26 Administration of Naloxone.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Transportation (TRN)/TRN-01 Schedule of School Hours.pdf, created 6 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Transportation (TRN)/TRN-02 Student Transportation Safety and Discipline.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Transportation (TRN)/TRN-03 Field Trip & Athletics Transportation.pdf, created 26 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-01 Anti-Hazing.pdf, created 23 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-03 Public Record Requests.pdf, created 7 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-04 School Visitor Guidelines.pdf, created 31 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-05 Subpoenas.pdf, created 3 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-06 Religious Holy Days.pdf, created 4 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-07 Student Records.pdf, created 48 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-08 Adherence to Court Orders.pdf, created 3 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-09 Political Activity by Public Employees.pdf, created 14 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-10 Military Recruiters.pdf, created 6 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-14 Gathering on School Grounds, Distrib of Materials in Schools.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-15 Student Surveys.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-16 Student Health Information.pdf, created 11 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-17 Religious Expression in Schools.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-18 Display of Flag and School Ceremonies.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-19 Conflict of Interest Law-City Employees.pdf, created 58 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-20 Corporal Punishment.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-21 Use of BPS Buildings and Facilities for Political Purposes.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Legal Advisor (LGL)/LGL-22 Sexual Offender Registry Information.pdf, created 14 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Student Support Services (SSS)/SSS-02 Homeless Students.pdf, created 25 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Student Support Services (SSS)/SSS-07 Persistently Dangerous Schools Standards for Determination.pdf, created 13 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Student Support Services (SSS)/SSS-09 Employment Permit Applictions and Issuance of Work Permits.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Student Support Services (SSS)/SSS-18 Bullying Prevention and Intervention Plan.pdf, created 105 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Student Support Services (SSS)/SSS-19 Home & Hospital Instruction Policy.pdf, created 26 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Athletics (ATH)/ATH-01 Prevention and Management of Sports-Related Head Injuries.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Athletics (ATH)/ATH-02 Athletic Eligibility.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-01 Performance Evaluation of Custodians.pdf, created 33 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-02 Work Order Requests.pdf, created 11 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-03 Renovations to School Buildings and Yards.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-04 Custodial Pay Adjustments.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-05 Facilities Building Permits & Conditions.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-08 Recycling and Zero Waste Policy.pdf, created 22 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-09 Material Distribution Procedures.pdf, created 6 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-10 Integrated Pest Management (IPM).pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-11 Green Cleaners Policy.pdf, created 11 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-12 Damage Resulting from Fire, Theft, Vandalism or Unlawful Acts.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-15 SY Environmental Audit Program .pdf, created 7 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-18 Science Safety in Schools.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-19 Cleaning & Disinfecting Bodily Fluid Spill SOP SY23 (1).pdf, created 21 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Facilities Management (FMT)/FMT-20 Drinking Water Access Policy.pdf, created 57 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Attendance (ACA)/ACA-18 Attendance Policies & Procedures.pdf, created 116 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-01 Student Search Procedures.pdf, created 16 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-02 Weapons and Objects of No Reasonable Use.pdf, created 11 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-03 Locker Policy.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-04 Incident Data Reporting and Release.pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-04 Incident Data Reporting and Release (1).pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-08 Release of Students to Authorized Persons.pdf, created 12 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-09 Lost Children Procedures.pdf, created 16 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Safety Services (SAF)/SAF-12 School Access Control.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/English Learners (EL)/EL-04 Title I Expenditures for ELs.pdf, created 50 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/English Learners (EL)/EL-06 Initial Identification and Assessment of Multilingual Learners.pdf, created 42 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/English Learners (EL)/EL-07 Instructional System & Monitoring for Multilingual Learners.pdf, created 45 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Communications (COM)/COM-01 Communications Policy.pdf, created 7 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Communications (COM)/COM-02 Media Relations Policy.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Superintendent's Office (SUP)/SUP-19 De-Escalation and Physical Restraint Policy.docx.pdf, created 84 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Superintendent's Office (SUP)/SUP-20 Child Abuse and Neglect.pdf, created 37 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Special Education (SPE)/SPE-14 Counseling Guidelines .pdf, created 23 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Special Education (SPE)/SPE-20 SPED Screening for 3 and 4 Year Olds.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-01 Procedures for Conducting Educational Research.pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-02 State Testing Security and Ethics.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-03 Guidelines and Procedures for Accessing Student Data.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-04 BPS Balanced Assessment System.pdf, created 14 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-05 BPS Survey Administration Guidelines.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-06 Participation Guidelines for Testing English Learners on Statewide Assessments.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Data and Accountability (ODA)/ODA-07 Required Documentation to Withdraw Students.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-HS04 School Leader Screening Process.pdf, created 25 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-HS06 Substitute Teachers.pdf, created 17 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-HS07 Staffing Reassignment and Hiring.pdf, created 57 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-HS07.1 Qualifications for Additional Program Areas.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-L01 Teacher Licensure.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-L02 Paraprofessional ESSA Requirements.pdf, created 24 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-L03 Licensure Requirements for Principals-Heads of School and BASAS.pdf, created 15 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM01 Performance Evaluation of Teachers.pdf, created 3 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM02 Performance Evaluation of Instructional BASAS Administrators.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM02A Performance Evaluation of Non-Instructional BASAS Administrators.pdf, created 20 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM03 Performance Evaluation of Members of the Administrative Guild .pdf, created 44 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM04 Performance Evaluation of Non-DESE Licensed BTU Employees.pdf, created 14 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM05 Performance Evaluation of Lunch Monitors.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM06 Performance Evaluation of Managerial Employees.pdf, created 23 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM07 Performance Evaluation of Classroom Paraprofessionals.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM07A Performance Evaluation of Non-Classroom Paraprofessionals.pdf, created 19 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM08 Performance Evaluation of Bus_Cab Monitors.pdf, created 14 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM09 Performance Evaluation of Cluster Substitutes.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PM10 Performance Evaluation of ABA Specialists.pdf, created 46 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP01 Contractual Benefits_ Career Awards, Salary Lanes, Salary Steps, Academic Ladder Credits.pdf, created 29 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP03 Tuition Reimbursement.pdf, created 20 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP05 Attendance Monitoring.pdf, created 13 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP06 Confidentiality of Personnel Records and Employment Verification.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP07 Workers' Compensation Procedures.pdf, created 18 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP08 Incentive for Early Notification of Termination for BTU.pdf, created 5 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP09 Criminal History Screening.pdf, created 69 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP11 Drug Free Workplace Policy and Procedure.pdf, created 6 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP12 Massachusetts Domestic Violence Leave Policy.pdf, created 7 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP13 Employee Sick Leave Policy.pdf, created 25 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP13A Family and Medical Leave Act.pdf, created 25 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP14 Leave for Cancer Screening and Organ Donations.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP15 Sick Leave Donation Program.pdf, created 28 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP16 Employee Savings and Investment Benefits.pdf, created 8 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP17 Employee Resignation, Retirement, and Separation Procedure.pdf, created 16 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP19 Performance-Related Dismissal Process for Teachers.pdf, created 9 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Office of Human Resources (HRS)/HRS-PP20 Changes in Pay Frequency for Paras and Comm. Field Coordinators.pdf, created 6 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-01 Promotion Policy.pdf, created 21 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-03 Textbook Management.pdf, created 25 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-05 Services for Multilingual Learner Students.pdf, created 87 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-06 GPA Calculation Method.pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-07 Graduation Requirements.pdf, created 16 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-08 Grading Requirements.pdf, created 13 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-23 Day Field Trip Guidelines.pdf, created 84 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-22 General Field Trip Guidelines.pdf, created 61 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-24 Domestic Overnight Field Trip Guidelines.pdf, created 153 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-25 International Field Trips Guidelines & Forms.pdf, created 210 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Academics (CAO)/CAO-27 Water Activities on Field Trips.pdf, created 26 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-01 School Parent Councils.pdf, created 27 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-02 School Site Councils.pdf, created 33 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-03 Student Government.pdf, created 19 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-04 Personnel Subcommittee.pdf, created 12 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-05 Title I Family Engagement Requirements.pdf, created 15 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-06 Boston Student Advisory Council.pdf, created 10 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-07 Home-School Compact.pdf, created 12 chunks.\n", + "Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Family and Community Advancement (FAM)/FAM-08 Translation and Interpretation Services.pdf, created 18 chunks.\n", + "All PDF chunks have been saved to /content/chunked_data_all_folders_cleaned.json\n", + "Number of folders traversed: 24\n", + "Number of PDFs processed: 190\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "79idNNJl0Zm_" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file