From 782645a6e157765387d9486ca35d394b29339ef8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 30 Aug 2023 12:05:43 +0200 Subject: [PATCH 1/3] Merge projection selects too many columns --- dask_expr/_merge.py | 9 ++++++++- dask_expr/tests/test_merge.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/dask_expr/_merge.py b/dask_expr/_merge.py index cacc25c8..72574c83 100644 --- a/dask_expr/_merge.py +++ b/dask_expr/_merge.py @@ -203,7 +203,14 @@ def _simplify_up(self, parent): projection = [projection] left, right = self.left, self.right - left_on, right_on = self.left_on, self.right_on + if isinstance(self.left_on, list): + left_on = self.left_on + else: + left_on = [self.left_on] if self.left_on is not None else [] + if isinstance(self.right_on, list): + right_on = self.right_on + else: + right_on = [self.right_on] if self.right_on is not None else [] left_suffix, right_suffix = self.suffixes[0], self.suffixes[1] project_left, project_right = [], [] diff --git a/dask_expr/tests/test_merge.py b/dask_expr/tests/test_merge.py index 96365473..dacb92b3 100644 --- a/dask_expr/tests/test_merge.py +++ b/dask_expr/tests/test_merge.py @@ -164,3 +164,15 @@ def test_merge_len(): query = df.merge(df2).index.optimize(fuse=False) expected = df[["x"]].merge(df2[["x"]]).index.optimize(fuse=False) assert query._name == expected._name + + +def test_merge_optimize_subset_strings(): + pdf = lib.DataFrame({"a": [1, 2], "aaa": 1}) + pdf2 = lib.DataFrame({"b": [1, 2], "aaa": 1}) + df = from_pandas(pdf) + df2 = from_pandas(pdf2) + + query = df.merge(df2, on="aaa")[["aaa"]].optimize(fuse=False) + exp = df[["aaa"]].merge(df2[["aaa"]], on="aaa").optimize(fuse=False) + assert query._name == exp._name + assert_eq(query, pdf.merge(pdf2, on="aaa")[["aaa"]]) From e4de7bfbc0cf60a9c7f1169c93a39378c4ca4384 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 30 Aug 2023 18:51:45 +0200 Subject: [PATCH 2/3] Update --- dask_expr/_merge.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dask_expr/_merge.py b/dask_expr/_merge.py index 72574c83..94b5ae00 100644 --- a/dask_expr/_merge.py +++ b/dask_expr/_merge.py @@ -9,6 +9,7 @@ from dask_expr._expr import Blockwise, Expr, Index, PartitionsFiltered, Projection from dask_expr._repartition import Repartition from dask_expr._shuffle import AssignPartitioningIndex, Shuffle, _contains_index_name +from dask_expr._util import _convert_to_list _HASH_COLUMN_NAME = "__hash_partition" @@ -203,14 +204,14 @@ def _simplify_up(self, parent): projection = [projection] left, right = self.left, self.right - if isinstance(self.left_on, list): - left_on = self.left_on - else: - left_on = [self.left_on] if self.left_on is not None else [] - if isinstance(self.right_on, list): - right_on = self.right_on - else: - right_on = [self.right_on] if self.right_on is not None else [] + left_on = _convert_to_list(self.left_on) + if left_on is None: + left_on = [] + + right_on = _convert_to_list(self.right_on) + if right_on is None: + right_on = [] + left_suffix, right_suffix = self.suffixes[0], self.suffixes[1] project_left, project_right = [], [] From 39f54e9ee2547ea5b379d66acca67532fe179238 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 30 Aug 2023 19:07:37 +0200 Subject: [PATCH 3/3] Remove unnecessary logic --- dask_expr/_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_expr/_merge.py b/dask_expr/_merge.py index 94b5ae00..20e3fc81 100644 --- a/dask_expr/_merge.py +++ b/dask_expr/_merge.py @@ -217,7 +217,7 @@ def _simplify_up(self, parent): # Find columns to project on the left for col in left.columns: - if left_on is not None and col in left_on or col in projection: + if col in left_on or col in projection: project_left.append(col) elif f"{col}{left_suffix}" in projection: project_left.append(col) @@ -228,7 +228,7 @@ def _simplify_up(self, parent): # Find columns to project on the right for col in right.columns: - if right_on is not None and col in right_on or col in projection: + if col in right_on or col in projection: project_right.append(col) elif f"{col}{right_suffix}" in projection: project_right.append(col)