From 2008ca8ca976499793ab0821113909d5de40bc8e Mon Sep 17 00:00:00 2001
From: Michael Terry
Date: Wed, 29 Nov 2023 13:12:04 -0500
Subject: [PATCH] fix: fix naive timestamps and int types in duckdb

Don't require a timezone with our parsed timestamps (otherwise, we
can't parse a timestamp like YYYY-MM-DD).

And make sure to ask Pandas to use modern nullable columns instead of
coerced-float columns when there are nullable-int datasets (like you
see if you have a powerset output table with an integer column).
---
 cumulus_library/databases.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py
index 3e2bf2ef..aa436cf1 100644
--- a/cumulus_library/databases.py
+++ b/cumulus_library/databases.py
@@ -116,7 +116,7 @@ def __init__(self, db_file: str):
             "from_iso8601_timestamp",
             self._compat_from_iso8601_timestamp,
             None,
-            duckdb.typing.TIMESTAMP_TZ,
+            duckdb.typing.TIMESTAMP,
         )
 
     def insert_tables(self, tables: dict[str, pyarrow.Table]) -> None:
@@ -151,7 +151,11 @@ def cursor(self) -> duckdb.DuckDBPyConnection:
         return self.connection
 
     def execute_as_pandas(self, sql: str) -> pandas.DataFrame:
-        return self.connection.execute(sql).df()
+        # We call convert_dtypes here in case there are integer columns.
+        # Pandas will normally cast nullable-int as a float type unless
+        # we call this to convert to its nullable int column type.
+        # PyAthena seems to do this correctly for us, but not DuckDB.
+        return self.connection.execute(sql).df().convert_dtypes()
 
     def close(self) -> None:
         self.connection.close()