Version 1.3.3
Austin Zielman committed May 30, 2024
1 parent 19c5599 commit 934b449
Showing 473 changed files with 63,592 additions and 132,057 deletions.
2 changes: 1 addition & 1 deletion abacusai/__init__.py
@@ -4,4 +4,4 @@
from .streaming_client import StreamingClient


__version__ = "1.3.2"
__version__ = "1.3.3"
1 change: 1 addition & 0 deletions abacusai/api_class/__init__.py
Expand Up @@ -2,6 +2,7 @@
from .ai_chat import *
from .batch_prediction import *
from .blob_input import *
from .connectors import *
from .dataset import *
from .dataset_application_connector import *
from .deployment import *
6 changes: 5 additions & 1 deletion abacusai/api_class/ai_agents.py
@@ -28,7 +28,7 @@ class WorkflowNodeInputSchema(ApiClass):
A react-jsonschema-form conformant schema for workflow node input.
Args:
json_schema (dict): The json schema for the input conformant to react-jsonschema-form specification. Must define keys like "title", "type" and "properties". Supported elements - Checkbox, Radio Button, Dropdown, Textarea, Number, Date, File Upload. Not supported - Nested elements, arrays and other complex types.
json_schema (dict): The json schema for the input conformant to react-jsonschema-form specification. Must define keys like "title", "type" and "properties". Supported elements - Checkbox, Radio Button, Dropdown, Textarea, Number, Date, file upload. Not supported - Nested elements, arrays and other complex types.
ui_schema (dict): The ui schema for the input conformant to react-jsonschema-form specification.
"""
json_schema: dict
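
For reference, a minimal json_schema sketch that satisfies the constraints described in the docstring (the field names and titles below are purely illustrative, not taken from the SDK):

# A hypothetical react-jsonschema-form schema: "title", "type" and "properties" are defined,
# and only flat, supported elements are used (no nested objects or arrays).
example_json_schema = {
    'title': 'Ticket triage input',
    'type': 'object',
    'properties': {
        'summary': {'type': 'string', 'title': 'Summary'},                      # Textarea
        'priority': {'type': 'string', 'title': 'Priority',
                     'enum': ['low', 'medium', 'high']},                        # Dropdown
        'urgent': {'type': 'boolean', 'title': 'Urgent'},                       # Checkbox
        'due_date': {'type': 'string', 'format': 'date', 'title': 'Due date'},  # Date
    },
}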
@@ -99,6 +99,8 @@ def to_dict(self):

@classmethod
def from_dict(cls, mapping: dict):
if any(field not in mapping for field in ['name', 'variable_type']):
raise ValueError('Invalid workflow node input mapping. Must contain keys - name, variable_type')
return cls(
name=mapping['name'],
variable_type=mapping['variable_type'],
@@ -179,6 +181,8 @@ def to_dict(self):

@classmethod
def from_dict(cls, node: dict):
if any(field not in node for field in ['name', 'function_name', 'source_code', 'input_mappings', 'output_mappings']):
raise ValueError('Invalid workflow graph node. Must contain keys - name, function_name, source_code, input_mappings, output_mappings.')
return cls(
name=node['name'],
function_name=node['function_name'],
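A small sketch of what the new validation buys, assuming the enclosing class is the SDK's WorkflowNodeInputMapping and that it is exported from abacusai.api_class; the mapping values are placeholders:

from abacusai.api_class import WorkflowNodeInputMapping  # assumed export path

# A mapping missing 'variable_type' now fails fast with a clear message
# instead of surfacing later as a KeyError.
try:
    WorkflowNodeInputMapping.from_dict({'name': 'user_query'})   # placeholder mapping
except ValueError as err:
    print(err)   # Invalid workflow node input mapping. Must contain keys - name, variable_type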
42 changes: 36 additions & 6 deletions abacusai/api_class/blob_input.py
@@ -5,9 +5,9 @@


@dataclasses.dataclass
class BlobInput(ApiClass):
class Blob(ApiClass):
"""
Binary large object input data.
Binary large object data.
Args:
filename (str): The original filename of the blob.
@@ -20,14 +20,44 @@ class BlobInput(ApiClass):
mime_type: str
size: int

def __init__(self, contents: bytes, mime_type: str = None, filename: str = None, size: int = None):
if contents is None or not isinstance(contents, bytes):
raise ValueError('contents must be a valid bytes object')
if mime_type is None:
try:
if filename:
mime_type = mimetypes.guess_type(filename)[0]
else:
import magic
mime_type = magic.Magic(mime=True).from_buffer(contents)
except Exception:
pass
else:
if not isinstance(mime_type, str):
raise ValueError('mime_type must be a valid string')

self.filename = filename
self.contents = contents
self.mime_type = mime_type
self.size = size or len(contents)

@classmethod
def from_local_file(cls, file_path: str) -> 'BlobInput':
def from_local_file(cls, file_path: str) -> 'Blob':
with open(file_path, 'rb') as f:
contents = f.read()
return cls.from_contents(contents, filename=file_path)

@classmethod
def from_contents(cls, contents: bytes, filename: str = None, mime_type: str = None) -> 'BlobInput':
if not mime_type and filename:
mime_type = mimetypes.guess_type(filename)[0]
def from_contents(cls, contents: bytes, filename: str = None, mime_type: str = None) -> 'Blob':
return cls(filename=filename, contents=contents, mime_type=mime_type, size=len(contents))


@dataclasses.dataclass
class BlobInput(Blob):
"""
Binary large object data.
An alias for Blob, used to indicate that this is an input object.
"""

def __init__(self, filename: str = None, contents: bytes = None, mime_type: str = None, size: int = None):
super().__init__(contents, mime_type, filename, size)
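
A short usage sketch of the reworked Blob class, assuming Blob and BlobInput are exported from abacusai.api_class; the file names are placeholders:

from abacusai.api_class import Blob, BlobInput   # assumed export path

# mime_type is guessed from the filename via mimetypes (or from python-magic when no filename is given).
blob = Blob.from_local_file('report.pdf')         # placeholder path to a local file
print(blob.mime_type, blob.size)                  # e.g. 'application/pdf' and the byte count

# BlobInput is now a thin alias of Blob with a keyword-friendly constructor.
inline = BlobInput(filename='note.txt', contents=b'hello world')
assert inline.mime_type == 'text/plain' and inline.size == len(b'hello world')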
2 changes: 1 addition & 1 deletion abacusai/api_class/connectors.py
@@ -28,7 +28,7 @@ class KafkaDatasetConfig(StreamingConnectorDatasetConfig):
Args:
topic (str): The kafka topic to consume
"""
topic: str
topic: str = dataclasses.field(default=None)

def __post_init__(self):
self.streaming_connector_type = enums.StreamingConnectorType.KAFKA
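With the new default, the config can be built with keyword arguments alone; a brief sketch (the topic name is a placeholder, and the export path is assumed):

from abacusai.api_class import KafkaDatasetConfig   # assumed export path

# topic now defaults to None, so construction no longer requires a positional value.
config = KafkaDatasetConfig(topic='clickstream-events')     # placeholder topic
print(config.streaming_connector_type)                      # StreamingConnectorType.KAFKA, set in __post_init__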
2 changes: 2 additions & 0 deletions abacusai/api_class/dataset.py
@@ -42,6 +42,7 @@ class DocumentProcessingConfig(ApiClass):
remove_header_footer (bool): Whether to remove headers and footers. Defaults to False. This option only takes effect when extract_bounding_boxes is True.
remove_watermarks (bool): Whether to remove watermarks. By default, it will be decided automatically based on the OCR mode and the document type. This option only takes effect when extract_bounding_boxes is True.
convert_to_markdown (bool): Whether to convert extracted text to markdown. Defaults to False. This option only takes effect when extract_bounding_boxes is True.
return_links (bool): Whether to augment the extracted text with embedded URLs in the PDF. Defaults to False. This option only takes effect when the document is a PDF.
"""
# NOTE: The defaults should match with clouddb.document_processing_results table defaults
extract_bounding_boxes: bool = False
@@ -50,6 +51,7 @@ class DocumentProcessingConfig(ApiClass):
remove_header_footer: bool = False
remove_watermarks: bool = True
convert_to_markdown: bool = False
return_links: bool = False


@dataclasses.dataclass
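A hedged sketch of the new flag in context; the other options shown are illustrative:

from abacusai.api_class import DocumentProcessingConfig   # assumed export path

doc_config = DocumentProcessingConfig(
    extract_bounding_boxes=True,    # most post-processing options only apply when this is True
    convert_to_markdown=True,
    return_links=True,              # new in this release; per the docstring it only affects PDFs
)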
36 changes: 36 additions & 0 deletions abacusai/api_class/enums.py
@@ -128,6 +128,17 @@ class MergeMode(ApiEnum):
TIME_WINDOW = 'TIME_WINDOW'


class OperatorType(ApiEnum):
UNPIVOT = 'UNPIVOT'
MARKDOWN = 'MARKDOWN'
CRAWLER = 'CRAWLER'
EXTRACT_DOCUMENT_DATA = 'EXTRACT_DOCUMENT_DATA'


class MarkdownOperatorInputType(ApiEnum):
HTML = 'HTML'


class FillLogic(ApiEnum):
# back / future
AVERAGE = 'average'
@@ -406,6 +417,7 @@ class VectorStoreTextEncoder(ApiEnum):
OPENAI = 'OPENAI'
SENTENCE_BERT = 'SENTENCE_BERT'
E5_SMALL = 'E5_SMALL'
CODE_BERT = 'CODE_BERT'


class LLMName(ApiEnum):
@@ -517,6 +529,7 @@ class OcrMode(ApiEnum):
COMPREHENSIVE = 'COMPREHENSIVE'
COMPREHENSIVE_V2 = 'COMPREHENSIVE_V2'
COMPREHENSIVE_TABLE_MD = 'COMPREHENSIVE_TABLE_MD'
TESSERACT_FAST = 'TESSERACT_FAST'


class StdDevThresholdType(ApiEnum):
@@ -535,6 +548,7 @@ class DataType(ApiEnum):
LIST = 'list'
STRUCT = 'struct'
NULL = 'null'
BINARY = 'binary'


class AgentInterface(ApiEnum):
@@ -545,4 +559,26 @@ class AgentInterface(ApiEnum):

class ProjectConfigType(ApiEnum):
CONSTRAINTS = 'CONSTRAINTS'
CHAT_FEEDBACK = 'CHAT_FEEDBACK'
REVIEW_MODE = 'REVIEW_MODE'


class CPUSize(ApiEnum):
SMALL = 'small'
MEDIUM = 'medium'
LARGE = 'large'


class MemorySize(ApiEnum):
SMALL = 16
MEDIUM = 32
LARGE = 64
XLARGE = 128

@classmethod
def from_value(cls, value):
sorted_members = sorted(cls, key=lambda mem: mem.value)
for member in sorted_members:
if member.value >= value:
return member
return None
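
from_value walks the sizes in ascending order and returns the smallest member that can accommodate the requested amount, or None when the request exceeds the largest defined size. A quick sketch (import path assumed):

from abacusai.api_class.enums import MemorySize   # assumed import path

print(MemorySize.from_value(16))    # MemorySize.SMALL  - exact match
print(MemorySize.from_value(20))    # MemorySize.MEDIUM - rounded up to 32
print(MemorySize.from_value(100))   # MemorySize.XLARGE - rounded up to 128
print(MemorySize.from_value(256))   # None - larger than any defined size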
111 changes: 107 additions & 4 deletions abacusai/api_class/feature_group.py
@@ -3,6 +3,7 @@

from . import enums
from .abstract import ApiClass, _ApiClassFactory
from .dataset import DocumentProcessingConfig


@dataclasses.dataclass
@@ -27,7 +28,6 @@ class NSamplingConfig(SamplingConfig):
The number of distinct values of the key columns to include in the sample, or number of rows if key columns not specified.
Args:
sampling_method (SamplingMethodType): N_SAMPLING
sample_count (int): The number of rows to include in the sample
key_columns (List[str]): The feature(s) to use as the key(s) when sampling
"""
@@ -44,7 +44,6 @@ class PercentSamplingConfig(SamplingConfig):
The fraction of distinct values of the feature group to include in the sample.
Args:
sampling_method (SamplingMethodType): PERCENT_SAMPLING
sample_percent (float): The percentage of the rows to sample
key_columns (List[str]): The feature(s) to use as the key(s) when sampling
"""
@@ -87,7 +86,6 @@ class LastNMergeConfig(MergeConfig):
Merge LAST N chunks/versions of an incremental dataset.
Args:
merge_mode (MergeMode): LAST_N
num_versions (int): The number of versions to merge. num_versions == 0 means merge all versions.
include_version_timestamp_column (bool): If set, include a column with the creation timestamp of source FG versions.
"""
@@ -104,7 +102,6 @@ class TimeWindowMergeConfig(MergeConfig):
Merge rows within a given timewindow of the most recent timestamp
Args:
merge_mode (MergeMode): TIME_WINDOW
feature_name (str): Time based column to index on
time_window_size_ms (int): Range of merged rows will be [MAX_TIME - time_window_size_ms, MAX_TIME]
include_version_timestamp_column (bool): If set, include a column with the creation timestamp of source FG versions.
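
For context, a hedged sketch of how these merge configs are typically constructed (class names assumed to be exported from abacusai.api_class; the column name and values are placeholders):

from abacusai.api_class import LastNMergeConfig, TimeWindowMergeConfig   # assumed export path

# Merge only the 10 most recent dataset versions.
last_n = LastNMergeConfig(num_versions=10, include_version_timestamp_column=True)

# Or merge one day's worth of rows, indexed on a timestamp column.
window = TimeWindowMergeConfig(feature_name='event_time',                 # placeholder column
                               time_window_size_ms=24 * 60 * 60 * 1000)   # 24 hours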
@@ -125,3 +122,109 @@ class _MergeConfigFactory(_ApiClassFactory):
enums.MergeMode.LAST_N: LastNMergeConfig,
enums.MergeMode.TIME_WINDOW: TimeWindowMergeConfig,
}


@dataclasses.dataclass
class OperatorConfig(ApiClass):
"""Configuration for a template Feature Group Operation"""
operator_type: enums.OperatorType = dataclasses.field(default=None, repr=False, init=False)

@classmethod
def _get_builder(cls):
return _OperatorConfigFactory

def __post_init__(self):
if self.__class__ == OperatorConfig:
raise TypeError('Cannot instantiate abstract OperatorConfig class.')


@dataclasses.dataclass
class UnpivotConfig(OperatorConfig):
""" Unpivot Columns in a FeatureGroup.
Args:
columns (List[str]): Which columns to unpivot.
index_column (str): Name of new column containing the unpivoted column names as its values
value_column (str): Name of new column containing the row values that were unpivoted.
exclude (bool): If True, the unpivoted columns are all the columns EXCEPT the ones in the columns argument. Default is False.
"""

columns: List[str] = dataclasses.field(default=None)
index_column: str = dataclasses.field(default=None)
value_column: str = dataclasses.field(default=None)
exclude: bool = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.UNPIVOT


@dataclasses.dataclass
class MarkdownConfig(OperatorConfig):
""" Transform a input column to a markdown column.
Args:
input_column (str): Name of input column to transform.
output_column (str): Name of output column to store transformed data.
input_column_type (MarkdownOperatorInputType): Type of input column to transform.
"""
input_column: str = dataclasses.field(default=None)
output_column: str = dataclasses.field(default=None)
input_column_type: enums.MarkdownOperatorInputType = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.MARKDOWN


@dataclasses.dataclass
class CrawlerTransformConfig(OperatorConfig):
""" Transform a input column of urls to html text
Args:
input_column (str): Name of input column to transform.
output_column (str): Name of output column to store transformed data.
depth_column (str): Increasing depth explores more links, capturing more content
disable_host_restriction (bool): If True, will not restrict crawling to the same host.
honour_website_rules (bool): If True, will respect robots.txt rules.
user_agent (str): If provided, will use this user agent instead of randomly selecting one.
"""
input_column: str = dataclasses.field(default=None)
output_column: str = dataclasses.field(default=None)
depth_column: str = dataclasses.field(default=None)
input_column_type: str = dataclasses.field(default=None, metadata={'deprecated': True})
crawl_depth: int = dataclasses.field(default=None, metadata={'deprecated': True})
disable_host_restriction: bool = dataclasses.field(default=None)
honour_website_rules: bool = dataclasses.field(default=None)
user_agent: str = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.CRAWLER


@dataclasses.dataclass
class ExtractDocumentDataConfig(OperatorConfig):
""" Extracts data from documents.
Args:
doc_id_column (str): Name of input document ID column.
document_column (str): Name of the input document column which contains the page infos. This column will be transformed to include the document processing config in the output feature group.
document_processing_config (DocumentProcessingConfig): Document processing configuration.
"""
doc_id_column: str = dataclasses.field(default=None)
document_column: str = dataclasses.field(default=None)
document_processing_config: DocumentProcessingConfig = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.EXTRACT_DOCUMENT_DATA


@dataclasses.dataclass
class _OperatorConfigFactory(_ApiClassFactory):
"""A class to select and return the the correct type of Operator Config based on a serialized OperatorConfig instance. """
config_abstract_class = OperatorConfig
config_class_key = 'operator_type'
config_class_map = {
enums.OperatorType.UNPIVOT: UnpivotConfig,
enums.OperatorType.MARKDOWN: MarkdownConfig,
enums.OperatorType.CRAWLER: CrawlerTransformConfig,
enums.OperatorType.EXTRACT_DOCUMENT_DATA: ExtractDocumentDataConfig,
}
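
A hedged usage sketch of the new operator configs; the column names and processing options below are placeholders, and the export path is assumed:

from abacusai.api_class import (UnpivotConfig, ExtractDocumentDataConfig,
                                DocumentProcessingConfig)   # assumed export path

# Unpivot monthly revenue columns into (month, revenue) rows.
unpivot = UnpivotConfig(
    columns=['jan_revenue', 'feb_revenue', 'mar_revenue'],   # placeholder column names
    index_column='month',
    value_column='revenue',
)

# Extract document data from a feature group of uploaded files.
extract = ExtractDocumentDataConfig(
    doc_id_column='doc_id',
    document_column='page_infos',
    document_processing_config=DocumentProcessingConfig(extract_bounding_boxes=True),
)

print(unpivot.operator_type, extract.operator_type)   # OperatorType.UNPIVOT OperatorType.EXTRACT_DOCUMENT_DATA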
2 changes: 2 additions & 0 deletions abacusai/api_class/model.py
@@ -170,6 +170,7 @@ class RegressionTrainingConfig(TrainingConfig):
custom_loss_functions (List[str]): Registered custom losses available for selection.
custom_metrics (List[str]): Registered custom metrics available for selection.
partial_dependence_analysis (PartialDependenceAnalysis): Specify whether to run partial dependence plots for all features or only some features.
do_masked_language_model_pretraining (bool): Specify whether to run a masked language model unsupervised pretraining step before supervised training in certain supported algorithms which use BERT-like backbones.
"""
objective: enums.RegressionObjective = dataclasses.field(default=None)
sort_objective: enums.RegressionObjective = dataclasses.field(default=None)
@@ -213,6 +214,7 @@ class RegressionTrainingConfig(TrainingConfig):
dropout_rate: int = dataclasses.field(default=None)
pretrained_model_name: str = dataclasses.field(default=None)
is_multilingual: bool = dataclasses.field(default=None)
do_masked_language_model_pretraining: bool = dataclasses.field(default=None)

# loss function
loss_function: enums.RegressionLossFunction = dataclasses.field(default=None)
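A hedged sketch of enabling the new pretraining flag; the model name is a placeholder and which algorithms honour the flag is not confirmed here:

from abacusai.api_class import RegressionTrainingConfig   # assumed export path

training_config = RegressionTrainingConfig(
    pretrained_model_name='bert-base-uncased',        # placeholder backbone name
    is_multilingual=False,
    do_masked_language_model_pretraining=True,        # new flag in this release
)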
12 changes: 7 additions & 5 deletions abacusai/chat_message.py
@@ -15,9 +15,10 @@ class ChatMessage(AbstractApiClass):
docIds (list[str]): A list of IDs of the uploaded documents, if the message has any
hotkeyTitle (str): The title of the hotkey prompt if the message has one
tasks (list[str]): The list of spawned tasks, if the message was broken down into smaller sub-tasks.
keywordArguments (dict): A dict of kwargs used to generate the response.
"""

def __init__(self, client, role=None, text=None, timestamp=None, isUseful=None, feedback=None, docIds=None, hotkeyTitle=None, tasks=None):
def __init__(self, client, role=None, text=None, timestamp=None, isUseful=None, feedback=None, docIds=None, hotkeyTitle=None, tasks=None, keywordArguments=None):
super().__init__(client, None)
self.role = role
self.text = text
@@ -27,11 +28,12 @@ def __init__(self, client, role=None, text=None, timestamp=None, isUseful=None,
self.doc_ids = docIds
self.hotkey_title = hotkeyTitle
self.tasks = tasks
self.keyword_arguments = keywordArguments
self.deprecated_keys = {}

def __repr__(self):
repr_dict = {f'role': repr(self.role), f'text': repr(self.text), f'timestamp': repr(self.timestamp), f'is_useful': repr(
self.is_useful), f'feedback': repr(self.feedback), f'doc_ids': repr(self.doc_ids), f'hotkey_title': repr(self.hotkey_title), f'tasks': repr(self.tasks)}
repr_dict = {f'role': repr(self.role), f'text': repr(self.text), f'timestamp': repr(self.timestamp), f'is_useful': repr(self.is_useful), f'feedback': repr(
self.feedback), f'doc_ids': repr(self.doc_ids), f'hotkey_title': repr(self.hotkey_title), f'tasks': repr(self.tasks), f'keyword_arguments': repr(self.keyword_arguments)}
class_name = "ChatMessage"
repr_str = ',\n '.join([f'{key}={value}' for key, value in repr_dict.items(
) if getattr(self, key, None) is not None and key not in self.deprecated_keys])
@@ -44,6 +46,6 @@ def to_dict(self):
Returns:
dict: The dict value representation of the class parameters
"""
resp = {'role': self.role, 'text': self.text, 'timestamp': self.timestamp, 'is_useful': self.is_useful,
'feedback': self.feedback, 'doc_ids': self.doc_ids, 'hotkey_title': self.hotkey_title, 'tasks': self.tasks}
resp = {'role': self.role, 'text': self.text, 'timestamp': self.timestamp, 'is_useful': self.is_useful, 'feedback': self.feedback,
'doc_ids': self.doc_ids, 'hotkey_title': self.hotkey_title, 'tasks': self.tasks, 'keyword_arguments': self.keyword_arguments}
return {key: value for key, value in resp.items() if value is not None and key not in self.deprecated_keys}
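
A hedged sketch of the new field flowing through to_dict; the client and keyword arguments are placeholders (in practice the client comes from ApiClient):

from abacusai.chat_message import ChatMessage

message = ChatMessage(client=None, role='BOT', text='Here is your forecast.',
                      keywordArguments={'temperature': 0.2})    # placeholder kwargs
# keywordArguments is stored as keyword_arguments and included in to_dict like any other set field.
print(message.to_dict())
# {'role': 'BOT', 'text': 'Here is your forecast.', 'keyword_arguments': {'temperature': 0.2}}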