Skip to content

Commit

Permalink
Merge pull request #54 from mindsdb/fix/check_ts_length
Browse files (browse the repository at this point in the history)
[Fix] Cast orderby to numerical, check TS length before finalizing split
  • Loading branch information
paxcema authored Apr 30, 2024
2 parents 22c707b + d07c086 commit e33b27c
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 0 deletions.
4 changes: 4 additions & 0 deletions dataprep_ml/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,10 @@ def clean_timeseries(df: pd.DataFrame, tss: dict) -> pd.DataFrame:

# save original order of columns
orig_cols = deepcopy(df.columns.to_list())

# cast order_by as numerical
df[tss['order_by']] = pd.to_numeric(df[tss['order_by']], errors='raise')

# fix duplicates by group
if tss.get('group_by', False):
correct_dfs = []
Expand Down
7 changes: 7 additions & 0 deletions dataprep_ml/splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ def splitter(
else:
train, dev, test = simple_split(data, pct_train, pct_dev, pct_test)

# Final assertions for time series
if min(len(train), len(dev)) < tss.get('window', 1):
raise Exception(f"Dataset size is too small for the specified window size ({tss.get('window', 1)})")

if min(len(train), len(dev), len(test)) < tss.get('horizon', 1):
raise Exception(f"Dataset size is too small for the specified horizon size ({tss.get('horizon', 1)})")

return {"train": train, "test": test, "dev": dev, "stratified_on": stratify_on}


Expand Down

0 comments on commit e33b27c

Please sign in to comment.