Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increase time limit for Conda builds in CI to 90 minutes #2075

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci_pipe.yml
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ jobs:
if: ${{ inputs.conda_run_build }}
needs: [documentation, test]
runs-on: linux-amd64-gpu-v100-latest-1
timeout-minutes: 60
timeout-minutes: 90
container:
image: ${{ inputs.base_container }}
options: --cap-add=sys_nice
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ dependencies:
- numexpr
- numpydoc=1.5
- onnx=1.15
- openai=1.13
- openai==1.13.*
- papermill=2.4.0
- pip
- pkg-config=0.29
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/examples_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies:
- numexpr
- numpydoc=1.5
- onnx=1.15
- openai=1.13
- openai==1.13.*
- papermill=2.4.0
- pip
- pluggy=1.3
Expand Down
3 changes: 2 additions & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ dependencies:
- &langchain-nvidia-ai-endpoints langchain-nvidia-ai-endpoints==0.0.11
- &langchain-openai langchain-openai==0.1.3
- milvus==2.3.5 # update to match pymilvus when available
- &openai openai==1.13.*
- pymilvus==2.3.6
- &nemollm nemollm==0.3.5

Expand Down Expand Up @@ -494,7 +495,7 @@ dependencies:
- newspaper3k=0.2
- numexpr
- onnx=1.15
- openai=1.13
- *openai
- pypdf=3.17.4
- *pypdfium2
- *python-docx
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ else:
pipeline.add_stage(RecipientFeaturesStage(config))
```

To tokenize the input data we will use Morpheus' `PreprocessNLPStage`. This stage uses the [cuDF subword tokenizer](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/subword_tokenize/#subwordtokenizer) to transform strings into a tensor of numbers to be fed into the neural network model. Rather than split the string by characters or whitespaces, we split them into meaningful subwords based upon the occurrence of the subwords in a large training corpus. You can find more details here: [https://arxiv.org/abs/1810.04805v2](https://arxiv.org/abs/1810.04805v2). All we need to know for now is that the text will be converted to subword token ids based on the vocabulary file that we provide (`vocab_hash_file=vocab file`).
To tokenize the input data we will use Morpheus' `PreprocessNLPStage`. This stage uses the [cuDF subword tokenizer](https://docs.rapids.ai/api/cudf/legacy/user_guide/api_docs/subword_tokenize/#subwordtokenizer) to transform strings into a tensor of numbers to be fed into the neural network model. Rather than split the string by characters or whitespaces, we split them into meaningful subwords based upon the occurrence of the subwords in a large training corpus. You can find more details here: [https://arxiv.org/abs/1810.04805v2](https://arxiv.org/abs/1810.04805v2). All we need to know for now is that the text will be converted to subword token ids based on the vocabulary file that we provide (`vocab_hash_file=vocab file`).

Let's go ahead and instantiate our `PreprocessNLPStage` and add it to the pipeline:

Expand Down
4 changes: 4 additions & 0 deletions python/morpheus/morpheus/_lib/cudf_helpers/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
from __future__ import annotations
import morpheus._lib.cudf_helpers
import typing
from cudf.core.column.column import ColumnBase
from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer
from cudf.core.buffer.spillable_buffer import SpillableBuffer
from cudf.core.dtypes import StructDtype
import _cython_3_0_11
import cudf
import itertools
import rmm

__all__ = [
"ColumnBase",
"ExposureTrackedBuffer",
"SpillableBuffer",
"StructDtype",
"as_buffer",
"bitmask_allocation_size_bytes",
"cudf",
"itertools",
"rmm"
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ langchain-openai==0.1.3
langchain==0.1.16
milvus==2.3.5
nemollm==0.3.5
openai==1.13.*
pymilvus==2.3.6
torch==2.4.0+cu124
18 changes: 18 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,6 +1125,16 @@ def langchain_community_fixture(fail_missing: bool):
fail_missing=fail_missing)


@pytest.fixture(name="langchain_openai", scope='session')
def langchain_openai_fixture(fail_missing: bool):
    """
    Session-scoped fixture that imports ``langchain_openai``, skipping the
    requesting test (or failing outright when ``fail_missing`` is set) if the
    package is not installed.
    """
    module = import_or_skip("langchain_openai",
                            reason=OPT_DEP_SKIP_REASON.format(package="langchain_openai"),
                            fail_missing=fail_missing)
    yield module


@pytest.fixture(name="langchain_nvidia_ai_endpoints", scope='session')
def langchain_nvidia_ai_endpoints_fixture(fail_missing: bool):
"""
Expand All @@ -1145,6 +1155,14 @@ def databricks_fixture(fail_missing: bool):
fail_missing=fail_missing)


@pytest.fixture(name="numexpr", scope='session')
def numexpr_fixture(fail_missing: bool):
    """
    Session-scoped fixture that imports ``numexpr``, skipping the requesting
    test (or failing outright when ``fail_missing`` is set) if the package is
    not installed.
    """
    skip_reason = OPT_DEP_SKIP_REASON.format(package="numexpr")
    yield import_or_skip("numexpr",
                         reason=skip_reason,
                         fail_missing=fail_missing)


@pytest.mark.usefixtures("openai")
@pytest.fixture(name="mock_chat_completion")
def mock_chat_completion_fixture():
Expand Down
8 changes: 8 additions & 0 deletions tests/morpheus_llm/llm/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ def langchain_community_fixture(langchain_community: types.ModuleType):
yield langchain_community


@pytest.fixture(name="langchain_openai", scope='session', autouse=True)
def langchain_openai_fixture(langchain_openai: types.ModuleType):
    """
    Autouse wrapper over the session-level ``langchain_openai`` fixture,
    making every test in this directory depend on the package being
    installed without naming the fixture explicitly.
    """
    yield langchain_openai


@pytest.fixture(name="langchain_nvidia_ai_endpoints", scope='session', autouse=True)
def langchain_nvidia_ai_endpoints_fixture(langchain_nvidia_ai_endpoints: types.ModuleType):
"""
Expand Down
2 changes: 1 addition & 1 deletion tests/morpheus_llm/llm/test_agents_simple_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def test_agents_simple_pipe_integration_openai(config: Config, questions: list[s
assert float(response_match.group(1)) >= 3.7


@pytest.mark.usefixtures("openai", "restore_environ")
@pytest.mark.usefixtures("langchain_community", "langchain_openai", "numexpr", "openai", "restore_environ")
@mock.patch("langchain_community.utilities.serpapi.SerpAPIWrapper.aresults")
@mock.patch("langchain_openai.OpenAI._agenerate",
autospec=True) # autospec is needed as langchain will inspect the function
Expand Down
Loading