Version 1.3.3
Austin Zielman committed May 30, 2024
1 parent 19c5599 commit 934b449
Showing 473 changed files with 63,592 additions and 132,057 deletions.
2 changes: 1 addition & 1 deletion abacusai/__init__.py
@@ -4,4 +4,4 @@
from .streaming_client import StreamingClient


__version__ = "1.3.2"
__version__ = "1.3.3"
1 change: 1 addition & 0 deletions abacusai/api_class/__init__.py
Expand Up @@ -2,6 +2,7 @@
from .ai_chat import *
from .batch_prediction import *
from .blob_input import *
from .connectors import *
from .dataset import *
from .dataset_application_connector import *
from .deployment import *
6 changes: 5 additions & 1 deletion abacusai/api_class/ai_agents.py
@@ -28,7 +28,7 @@ class WorkflowNodeInputSchema(ApiClass):
A react-jsonschema-form conformant schema for workflow node input.
Args:
json_schema (dict): The json schema for the input conformant to react-jsonschema-form specification. Must define keys like "title", "type" and "properties". Supported elements - Checkbox, Radio Button, Dropdown, Textarea, Number, Date, File Upload. Not supported - Nested elements, arrays and other complex types.
json_schema (dict): The json schema for the input conformant to react-jsonschema-form specification. Must define keys like "title", "type" and "properties". Supported elements - Checkbox, Radio Button, Dropdown, Textarea, Number, Date, file upload. Not supported - Nested elements, arrays and other complex types.
ui_schema (dict): The ui schema for the input conformant to react-jsonschema-form specification.
"""
json_schema: dict
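
For reference, a minimal json_schema sketch that satisfies the constraints described in the docstring (the field names and titles below are purely illustrative, not taken from the SDK):

# A hypothetical react-jsonschema-form schema: "title", "type" and "properties" are defined,
# and only flat, supported elements are used (no nested objects or arrays).
example_json_schema = {
    'title': 'Ticket triage input',
    'type': 'object',
    'properties': {
        'summary': {'type': 'string', 'title': 'Summary'},                      # Textarea
        'priority': {'type': 'string', 'title': 'Priority',
                     'enum': ['low', 'medium', 'high']},                        # Dropdown
        'urgent': {'type': 'boolean', 'title': 'Urgent'},                       # Checkbox
        'due_date': {'type': 'string', 'format': 'date', 'title': 'Due date'},  # Date
    },
}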
@@ -99,6 +99,8 @@ def to_dict(self):

@classmethod
def from_dict(cls, mapping: dict):
if any(field not in mapping for field in ['name', 'variable_type']):
raise ValueError('Invalid workflow node input mapping. Must contain keys - name, variable_type')
return cls(
name=mapping['name'],
variable_type=mapping['variable_type'],
@@ -179,6 +181,8 @@ def to_dict(self):

@classmethod
def from_dict(cls, node: dict):
if any(field not in node for field in ['name', 'function_name', 'source_code', 'input_mappings', 'output_mappings']):
raise ValueError('Invalid workflow graph node. Must contain keys - name, function_name, source_code, input_mappings, output_mappings.')
return cls(
name=node['name'],
function_name=node['function_name'],
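A small sketch of what the new validation buys, assuming the enclosing class is the SDK's WorkflowNodeInputMapping and that it is exported from abacusai.api_class; the mapping values are placeholders:

from abacusai.api_class import WorkflowNodeInputMapping  # assumed export path

# A mapping missing 'variable_type' now fails fast with a clear message
# instead of surfacing later as a KeyError.
try:
    WorkflowNodeInputMapping.from_dict({'name': 'user_query'})   # placeholder mapping
except ValueError as err:
    print(err)   # Invalid workflow node input mapping. Must contain keys - name, variable_type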
42 changes: 36 additions & 6 deletions abacusai/api_class/blob_input.py
@@ -5,9 +5,9 @@


@dataclasses.dataclass
class BlobInput(ApiClass):
class Blob(ApiClass):
"""
Binary large object input data.
Binary large object data.
Args:
filename (str): The original filename of the blob.
@@ -20,14 +20,44 @@ class BlobInput(ApiClass):
mime_type: str
size: int

def __init__(self, contents: bytes, mime_type: str = None, filename: str = None, size: int = None):
if contents is None or not isinstance(contents, bytes):
raise ValueError('contents must be a valid bytes object')
if mime_type is None:
try:
if filename:
mime_type = mimetypes.guess_type(filename)[0]
else:
import magic
mime_type = magic.Magic(mime=True).from_buffer(contents)
except Exception:
pass
else:
if not isinstance(mime_type, str):
raise ValueError('mime_type must be a valid string')

self.filename = filename
self.contents = contents
self.mime_type = mime_type
self.size = size or len(contents)

@classmethod
def from_local_file(cls, file_path: str) -> 'BlobInput':
def from_local_file(cls, file_path: str) -> 'Blob':
with open(file_path, 'rb') as f:
contents = f.read()
return cls.from_contents(contents, filename=file_path)

@classmethod
def from_contents(cls, contents: bytes, filename: str = None, mime_type: str = None) -> 'BlobInput':
if not mime_type and filename:
mime_type = mimetypes.guess_type(filename)[0]
def from_contents(cls, contents: bytes, filename: str = None, mime_type: str = None) -> 'Blob':
return cls(filename=filename, contents=contents, mime_type=mime_type, size=len(contents))


@dataclasses.dataclass
class BlobInput(Blob):
"""
Binary large object data.
An alias for Blob, used to indicate that this is an input object.
"""

def __init__(self, filename: str = None, contents: bytes = None, mime_type: str = None, size: int = None):
super().__init__(contents, mime_type, filename, size)
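
A short usage sketch of the reworked Blob class, assuming Blob and BlobInput are exported from abacusai.api_class; the file names are placeholders:

from abacusai.api_class import Blob, BlobInput   # assumed export path

# mime_type is guessed from the filename via mimetypes (or from python-magic when no filename is given).
blob = Blob.from_local_file('report.pdf')         # placeholder path to a local file
print(blob.mime_type, blob.size)                  # e.g. 'application/pdf' and the byte count

# BlobInput is now a thin alias of Blob with a keyword-friendly constructor.
inline = BlobInput(filename='note.txt', contents=b'hello world')
assert inline.mime_type == 'text/plain' and inline.size == len(b'hello world')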
2 changes: 1 addition & 1 deletion abacusai/api_class/connectors.py
@@ -28,7 +28,7 @@ class KafkaDatasetConfig(StreamingConnectorDatasetConfig):
Args:
topic (str): The kafka topic to consume
"""
topic: str
topic: str = dataclasses.field(default=None)

def __post_init__(self):
self.streaming_connector_type = enums.StreamingConnectorType.KAFKA
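With the new default, the config can be built with keyword arguments alone; a brief sketch (the topic name is a placeholder, and the export path is assumed):

from abacusai.api_class import KafkaDatasetConfig   # assumed export path

# topic now defaults to None, so construction no longer requires a positional value.
config = KafkaDatasetConfig(topic='clickstream-events')     # placeholder topic
print(config.streaming_connector_type)                      # StreamingConnectorType.KAFKA, set in __post_init__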
2 changes: 2 additions & 0 deletions abacusai/api_class/dataset.py
@@ -42,6 +42,7 @@ class DocumentProcessingConfig(ApiClass):
remove_header_footer (bool): Whether to remove headers and footers. Defaults to False. This option only takes effect when extract_bounding_boxes is True.
remove_watermarks (bool): Whether to remove watermarks. By default, it will be decided automatically based on the OCR mode and the document type. This option only takes effect when extract_bounding_boxes is True.
convert_to_markdown (bool): Whether to convert extracted text to markdown. Defaults to False. This option only takes effect when extract_bounding_boxes is True.
return_links (bool): Whether to augment the extracted text with embedded URLs in the PDF. Defaults to False. This option only takes effect when the document is a PDF.
"""
# NOTE: The defaults should match with clouddb.document_processing_results table defaults
extract_bounding_boxes: bool = False
@@ -50,6 +51,7 @@ class DocumentProcessingConfig(ApiClass):
remove_header_footer: bool = False
remove_watermarks: bool = True
convert_to_markdown: bool = False
return_links: bool = False


@dataclasses.dataclass
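A hedged sketch of the new flag in context; the other options shown are illustrative:

from abacusai.api_class import DocumentProcessingConfig   # assumed export path

doc_config = DocumentProcessingConfig(
    extract_bounding_boxes=True,    # most post-processing options only apply when this is True
    convert_to_markdown=True,
    return_links=True,              # new in this release; per the docstring it only affects PDFs
)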
36 changes: 36 additions & 0 deletions abacusai/api_class/enums.py
@@ -128,6 +128,17 @@ class MergeMode(ApiEnum):
TIME_WINDOW = 'TIME_WINDOW'


class OperatorType(ApiEnum):
UNPIVOT = 'UNPIVOT'
MARKDOWN = 'MARKDOWN'
CRAWLER = 'CRAWLER'
EXTRACT_DOCUMENT_DATA = 'EXTRACT_DOCUMENT_DATA'


class MarkdownOperatorInputType(ApiEnum):
HTML = 'HTML'


class FillLogic(ApiEnum):
# back / future
AVERAGE = 'average'
@@ -406,6 +417,7 @@ class VectorStoreTextEncoder(ApiEnum):
OPENAI = 'OPENAI'
SENTENCE_BERT = 'SENTENCE_BERT'
E5_SMALL = 'E5_SMALL'
CODE_BERT = 'CODE_BERT'


class LLMName(ApiEnum):
@@ -517,6 +529,7 @@ class OcrMode(ApiEnum):
COMPREHENSIVE = 'COMPREHENSIVE'
COMPREHENSIVE_V2 = 'COMPREHENSIVE_V2'
COMPREHENSIVE_TABLE_MD = 'COMPREHENSIVE_TABLE_MD'
TESSERACT_FAST = 'TESSERACT_FAST'


class StdDevThresholdType(ApiEnum):
@@ -535,6 +548,7 @@ class DataType(ApiEnum):
LIST = 'list'
STRUCT = 'struct'
NULL = 'null'
BINARY = 'binary'


class AgentInterface(ApiEnum):
@@ -545,4 +559,26 @@ class AgentInterface(ApiEnum):

class ProjectConfigType(ApiEnum):
CONSTRAINTS = 'CONSTRAINTS'
CHAT_FEEDBACK = 'CHAT_FEEDBACK'
REVIEW_MODE = 'REVIEW_MODE'


class CPUSize(ApiEnum):
SMALL = 'small'
MEDIUM = 'medium'
LARGE = 'large'


class MemorySize(ApiEnum):
SMALL = 16
MEDIUM = 32
LARGE = 64
XLARGE = 128

@classmethod
def from_value(cls, value):
sorted_members = sorted(cls, key=lambda mem: mem.value)
for member in sorted_members:
if member.value >= value:
return member
return None
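
from_value walks the sizes in ascending order and returns the smallest member that can accommodate the requested amount, or None when the request exceeds the largest defined size. A quick sketch (import path assumed):

from abacusai.api_class.enums import MemorySize   # assumed import path

print(MemorySize.from_value(16))    # MemorySize.SMALL  - exact match
print(MemorySize.from_value(20))    # MemorySize.MEDIUM - rounded up to 32
print(MemorySize.from_value(100))   # MemorySize.XLARGE - rounded up to 128
print(MemorySize.from_value(256))   # None - larger than any defined size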
111 changes: 107 additions & 4 deletions abacusai/api_class/feature_group.py
@@ -3,6 +3,7 @@

from . import enums
from .abstract import ApiClass, _ApiClassFactory
from .dataset import DocumentProcessingConfig


@dataclasses.dataclass
@@ -27,7 +28,6 @@ class NSamplingConfig(SamplingConfig):
The number of distinct values of the key columns to include in the sample, or number of rows if key columns not specified.
Args:
sampling_method (SamplingMethodType): N_SAMPLING
sample_count (int): The number of rows to include in the sample
key_columns (List[str]): The feature(s) to use as the key(s) when sampling
"""
@@ -44,7 +44,6 @@ class PercentSamplingConfig(SamplingConfig):
The fraction of distinct values of the feature group to include in the sample.
Args:
sampling_method (SamplingMethodType): PERCENT_SAMPLING
sample_percent (float): The percentage of the rows to sample
key_columns (List[str]): The feature(s) to use as the key(s) when sampling
"""
@@ -87,7 +86,6 @@ class LastNMergeConfig(MergeConfig):
Merge LAST N chunks/versions of an incremental dataset.
Args:
merge_mode (MergeMode): LAST_N
num_versions (int): The number of versions to merge. num_versions == 0 means merge all versions.
include_version_timestamp_column (bool): If set, include a column with the creation timestamp of source FG versions.
"""
@@ -104,7 +102,6 @@ class TimeWindowMergeConfig(MergeConfig):
Merge rows within a given timewindow of the most recent timestamp
Args:
merge_mode (MergeMode): TIME_WINDOW
feature_name (str): Time based column to index on
time_window_size_ms (int): Range of merged rows will be [MAX_TIME - time_window_size_ms, MAX_TIME]
include_version_timestamp_column (bool): If set, include a column with the creation timestamp of source FG versions.
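
For context, a hedged sketch of how these merge configs are typically constructed (class names assumed to be exported from abacusai.api_class; the column name and values are placeholders):

from abacusai.api_class import LastNMergeConfig, TimeWindowMergeConfig   # assumed export path

# Merge only the 10 most recent dataset versions.
last_n = LastNMergeConfig(num_versions=10, include_version_timestamp_column=True)

# Or merge one day's worth of rows, indexed on a timestamp column.
window = TimeWindowMergeConfig(feature_name='event_time',                 # placeholder column
                               time_window_size_ms=24 * 60 * 60 * 1000)   # 24 hours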
@@ -125,3 +122,109 @@ class _MergeConfigFactory(_ApiClassFactory):
enums.MergeMode.LAST_N: LastNMergeConfig,
enums.MergeMode.TIME_WINDOW: TimeWindowMergeConfig,
}


@dataclasses.dataclass
class OperatorConfig(ApiClass):
"""Configuration for a template Feature Group Operation"""
operator_type: enums.OperatorType = dataclasses.field(default=None, repr=False, init=False)

@classmethod
def _get_builder(cls):
return _OperatorConfigFactory

def __post_init__(self):
if self.__class__ == OperatorConfig:
raise TypeError('Cannot instantiate abstract OperatorConfig class.')


@dataclasses.dataclass
class UnpivotConfig(OperatorConfig):
""" Unpivot Columns in a FeatureGroup.
Args:
columns (List[str]): Which columns to unpivot.
index_column (str): Name of new column containing the unpivoted column names as its values
value_column (str): Name of new column containing the row values that were unpivoted.
exclude (bool): If True, the unpivoted columns are all the columns EXCEPT the ones in the columns argument. Default is False.
"""

columns: List[str] = dataclasses.field(default=None)
index_column: str = dataclasses.field(default=None)
value_column: str = dataclasses.field(default=None)
exclude: bool = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.UNPIVOT


@dataclasses.dataclass
class MarkdownConfig(OperatorConfig):
""" Transform a input column to a markdown column.
Args:
input_column (str): Name of input column to transform.
output_column (str): Name of output column to store transformed data.
input_column_type (MarkdownOperatorInputType): Type of input column to transform.
"""
input_column: str = dataclasses.field(default=None)
output_column: str = dataclasses.field(default=None)
input_column_type: enums.MarkdownOperatorInputType = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.MARKDOWN


@dataclasses.dataclass
class CrawlerTransformConfig(OperatorConfig):
""" Transform a input column of urls to html text
Args:
input_column (str): Name of input column to transform.
output_column (str): Name of output column to store transformed data.
depth_column (str): Increasing depth explores more links, capturing more content
disable_host_restriction (bool): If True, will not restrict crawling to the same host.
honour_website_rules (bool): If True, will respect robots.txt rules.
user_agent (str): If provided, will use this user agent instead of randomly selecting one.
"""
input_column: str = dataclasses.field(default=None)
output_column: str = dataclasses.field(default=None)
depth_column: str = dataclasses.field(default=None)
input_column_type: str = dataclasses.field(default=None, metadata={'deprecated': True})
crawl_depth: int = dataclasses.field(default=None, metadata={'deprecated': True})
disable_host_restriction: bool = dataclasses.field(default=None)
honour_website_rules: bool = dataclasses.field(default=None)
user_agent: str = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.CRAWLER


@dataclasses.dataclass
class ExtractDocumentDataConfig(OperatorConfig):
""" Extracts data from documents.
Args:
doc_id_column (str): Name of input document ID column.
document_column (str): Name of the input document column which contains the page infos. This column will be transformed to include the document processing config in the output feature group.
document_processing_config (DocumentProcessingConfig): Document processing configuration.
"""
doc_id_column: str = dataclasses.field(default=None)
document_column: str = dataclasses.field(default=None)
document_processing_config: DocumentProcessingConfig = dataclasses.field(default=None)

def __post_init__(self):
self.operator_type = enums.OperatorType.EXTRACT_DOCUMENT_DATA


@dataclasses.dataclass
class _OperatorConfigFactory(_ApiClassFactory):
"""A class to select and return the the correct type of Operator Config based on a serialized OperatorConfig instance. """
config_abstract_class = OperatorConfig
config_class_key = 'operator_type'
config_class_map = {
enums.OperatorType.UNPIVOT: UnpivotConfig,
enums.OperatorType.MARKDOWN: MarkdownConfig,
enums.OperatorType.CRAWLER: CrawlerTransformConfig,
enums.OperatorType.EXTRACT_DOCUMENT_DATA: ExtractDocumentDataConfig,
}
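
A hedged usage sketch of the new operator configs; the column names and processing options below are placeholders, and the export path is assumed:

from abacusai.api_class import (UnpivotConfig, ExtractDocumentDataConfig,
                                DocumentProcessingConfig)   # assumed export path

# Unpivot monthly revenue columns into (month, revenue) rows.
unpivot = UnpivotConfig(
    columns=['jan_revenue', 'feb_revenue', 'mar_revenue'],   # placeholder column names
    index_column='month',
    value_column='revenue',
)

# Extract document data from a feature group of uploaded files.
extract = ExtractDocumentDataConfig(
    doc_id_column='doc_id',
    document_column='page_infos',
    document_processing_config=DocumentProcessingConfig(extract_bounding_boxes=True),
)

print(unpivot.operator_type, extract.operator_type)   # OperatorType.UNPIVOT OperatorType.EXTRACT_DOCUMENT_DATA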
2 changes: 2 additions & 0 deletions abacusai/api_class/model.py
@@ -170,6 +170,7 @@ class RegressionTrainingConfig(TrainingConfig):
custom_loss_functions (List[str]): Registered custom losses available for selection.
custom_metrics (List[str]): Registered custom metrics available for selection.
partial_dependence_analysis (PartialDependenceAnalysis): Specify whether to run partial dependence plots for all features or only some features.
do_masked_language_model_pretraining (bool): Specify whether to run a masked language model unsupervised pretraining step before supervised training in certain supported algorithms which use BERT-like backbones.
"""
objective: enums.RegressionObjective = dataclasses.field(default=None)
sort_objective: enums.RegressionObjective = dataclasses.field(default=None)
@@ -213,6 +214,7 @@ class RegressionTrainingConfig(TrainingConfig):
dropout_rate: int = dataclasses.field(default=None)
pretrained_model_name: str = dataclasses.field(default=None)
is_multilingual: bool = dataclasses.field(default=None)
do_masked_language_model_pretraining: bool = dataclasses.field(default=None)

# loss function
loss_function: enums.RegressionLossFunction = dataclasses.field(default=None)
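A hedged sketch of enabling the new pretraining flag; the model name is a placeholder and which algorithms honour the flag is not confirmed here:

from abacusai.api_class import RegressionTrainingConfig   # assumed export path

training_config = RegressionTrainingConfig(
    pretrained_model_name='bert-base-uncased',        # placeholder backbone name
    is_multilingual=False,
    do_masked_language_model_pretraining=True,        # new flag in this release
)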
12 changes: 7 additions & 5 deletions abacusai/chat_message.py
@@ -15,9 +15,10 @@ class ChatMessage(AbstractApiClass):
docIds (list[str]): A list of IDs of the uploaded documents, if the message has any
hotkeyTitle (str): The title of the hotkey prompt if the message has one
tasks (list[str]): The list of spawned tasks, if the message was broken down into smaller sub-tasks.
keywordArguments (dict): A dict of kwargs used to generate the response.
"""

def __init__(self, client, role=None, text=None, timestamp=None, isUseful=None, feedback=None, docIds=None, hotkeyTitle=None, tasks=None):
def __init__(self, client, role=None, text=None, timestamp=None, isUseful=None, feedback=None, docIds=None, hotkeyTitle=None, tasks=None, keywordArguments=None):
super().__init__(client, None)
self.role = role
self.text = text
@@ -27,11 +28,12 @@ def __init__(self, client, role=None, text=None, timestamp=None, isUseful=None,
self.doc_ids = docIds
self.hotkey_title = hotkeyTitle
self.tasks = tasks
self.keyword_arguments = keywordArguments
self.deprecated_keys = {}

def __repr__(self):
repr_dict = {f'role': repr(self.role), f'text': repr(self.text), f'timestamp': repr(self.timestamp), f'is_useful': repr(
self.is_useful), f'feedback': repr(self.feedback), f'doc_ids': repr(self.doc_ids), f'hotkey_title': repr(self.hotkey_title), f'tasks': repr(self.tasks)}
repr_dict = {f'role': repr(self.role), f'text': repr(self.text), f'timestamp': repr(self.timestamp), f'is_useful': repr(self.is_useful), f'feedback': repr(
self.feedback), f'doc_ids': repr(self.doc_ids), f'hotkey_title': repr(self.hotkey_title), f'tasks': repr(self.tasks), f'keyword_arguments': repr(self.keyword_arguments)}
class_name = "ChatMessage"
repr_str = ',\n '.join([f'{key}={value}' for key, value in repr_dict.items(
) if getattr(self, key, None) is not None and key not in self.deprecated_keys])
@@ -44,6 +46,6 @@ def to_dict(self):
Returns:
dict: The dict value representation of the class parameters
"""
resp = {'role': self.role, 'text': self.text, 'timestamp': self.timestamp, 'is_useful': self.is_useful,
'feedback': self.feedback, 'doc_ids': self.doc_ids, 'hotkey_title': self.hotkey_title, 'tasks': self.tasks}
resp = {'role': self.role, 'text': self.text, 'timestamp': self.timestamp, 'is_useful': self.is_useful, 'feedback': self.feedback,
'doc_ids': self.doc_ids, 'hotkey_title': self.hotkey_title, 'tasks': self.tasks, 'keyword_arguments': self.keyword_arguments}
return {key: value for key, value in resp.items() if value is not None and key not in self.deprecated_keys}
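
A hedged sketch of the new field flowing through to_dict; the client and keyword arguments are placeholders (in practice the client comes from ApiClient):

from abacusai.chat_message import ChatMessage

message = ChatMessage(client=None, role='BOT', text='Here is your forecast.',
                      keywordArguments={'temperature': 0.2})    # placeholder kwargs
# keywordArguments is stored as keyword_arguments and included in to_dict like any other set field.
print(message.to_dict())
# {'role': 'BOT', 'text': 'Here is your forecast.', 'keyword_arguments': {'temperature': 0.2}}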