Skip to content

Commit

Permalink
linting, minor refactoring, merge PR#3, bump version
Browse files Browse the repository at this point in the history
  • Loading branch information
naqvis committed Mar 27, 2023
1 parent 62a965f commit caac92b
Show file tree
Hide file tree
Showing 16 changed files with 130 additions and 135 deletions.
4 changes: 4 additions & 0 deletions .ameba.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@ Metrics/CyclomaticComplexity:
MaxComplexity: 30
Enabled: true
Severity: Convention
Lint/NotNil:
Enabled: false
Style/VerboseBlock:
Enabled: false
2 changes: 1 addition & 1 deletion shard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors:
description: |
Crystal Shard for data analysis , wrangling , munging.
crystal: ">= 0.36.0, < 2.0.0"
crystal: ~> 0.36.0

dependencies:
db:
Expand Down
18 changes: 9 additions & 9 deletions spec/column_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ module Crysda
"B", 2.5
)

cumsum_grp = sales.group_by("product").add_column("cum_sales".with { |v| v["sales"].cumsum })
cumsum_grp = sales.group_by("product").add_column("cum_sales".with(&.["sales"].cumsum))
cumsum_grp.tap do |grp|
grp.num_row.should eq(sales.num_row)
grp["cum_sales"][1].should eq(44.5)
Expand All @@ -91,8 +91,8 @@ module Crysda
)

pct_chng = sales.group_by("product")
.add_column("sales_pct_change".with { |v| v["sales"].pct_change })
.add_column("price_pct_change".with { |v| v["price"].pct_change })
.add_column("sales_pct_change".with(&.["sales"].pct_change))
.add_column("price_pct_change".with(&.["price"].pct_change))

pct_chng.tap do |df|
df.num_row.should eq(sales.num_row)
Expand All @@ -113,8 +113,8 @@ module Crysda
)

lead_lag = sales
.add_column("sales_lead".with { |v| v["sales"].lead })
.add_column("price_lag".with { |v| v["price"].lag(n: 2) })
.add_column("sales_lead".with(&.["sales"].lead))
.add_column("price_lag".with(&.["price"].lag(n: 2)))

lead_lag.tap do |df|
df.num_row.should eq(sales.num_row)
Expand Down Expand Up @@ -162,12 +162,12 @@ module Crysda
3, nil, "berlin",
4, 75, "berlin"
)
sales.add_column("lagged".with { |v| v["store"].lead(n: 1, default: "bla") })
sales.add_column("lagged".with(&.["store"].lead(n: 1, default: "bla")))
.tap do |df|
df["lagged"][-1].should eq("bla")
end
# test numeric (with int default to add a bit complexity)
sales.add_column("lagged".with { |v| v["quarter"].lead(default: 42) })
sales.add_column("lagged".with(&.["quarter"].lead(default: 42)))
.tap do |df|
df["lagged"][-1].should eq(42)
end
Expand All @@ -178,13 +178,13 @@ module Crysda
UUID.random,
UUID.random
)
df.add_column("prev_uuid".with { |v| v["uuid"].lag(default: "foo") })
df.add_column("prev_uuid".with(&.["uuid"].lag(default: "foo")))
.tap do |v|
v["prev_uuid"][0].should eq("foo")
end

uuid = UUID.random
df.add_column("prev_uuid".with { |v| v["uuid"].lag(default: uuid) })
df.add_column("prev_uuid".with(&.["uuid"].lag(default: uuid)))
.tap do |v|
v["prev_uuid"][0].should eq(uuid)
end
Expand Down
66 changes: 33 additions & 33 deletions spec/core_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,28 @@ module Crysda
end

it "should select with regex" do
SLEEP_DATA.select { |v| v.ends_with?("wt") }.num_col.should eq(2)
SLEEP_DATA.select { |v| v.starts_with?("sleep") }.num_col.should eq(3)
SLEEP_DATA.select { |v| v.list_of("conservation", "foobar", "order") }.num_col.should eq(2)
SLEEP_DATA.select(&.ends_with?("wt")).num_col.should eq(2)
SLEEP_DATA.select(&.starts_with?("sleep")).num_col.should eq(3)
SLEEP_DATA.select(&.list_of("conservation", "foobar", "order")).num_col.should eq(2)

SLEEP_DATA.select(Int32Col)
SLEEP_DATA.select? { |v| v.is_a?(Int32Col) }
SLEEP_DATA.select? { |v| v.name.starts_with?("foo") }
SLEEP_DATA.select?(&.is_a?(Int32Col))
SLEEP_DATA.select?(&.name.starts_with?("foo"))

IRIS_DATA.select(StringCol).names.should eq(["Species"])
end

it "should allow to remove columns" do
SLEEP_DATA.reject { |v| v.ends_with?("wt") }.num_col.should eq(9)
SLEEP_DATA.reject { |v| v.starts_with?("sleep") }.num_col.should eq(8)
SLEEP_DATA.reject { |v| v.list_of("conservation", "foobar", "order") }.num_col.should eq(9)
SLEEP_DATA.reject(&.ends_with?("wt")).num_col.should eq(9)
SLEEP_DATA.reject(&.starts_with?("sleep")).num_col.should eq(8)
SLEEP_DATA.reject(&.list_of("conservation", "foobar", "order")).num_col.should eq(9)

IRIS_DATA.reject(StringCol).num_col.should eq(4)
IRIS_DATA.reject? { |v| v.is_a?(StringCol) }.num_col.should eq(4)
IRIS_DATA.reject? { |v| v.name.starts_with?("Sepal") }.num_col.should eq(3)
IRIS_DATA.reject?(&.is_a?(StringCol)).num_col.should eq(4)
IRIS_DATA.reject?(&.name.starts_with?("Sepal")).num_col.should eq(3)

# also allow for negative selection (like in the context of gather)
IRIS_DATA.select { |e| e.except { |c| c.starts_with?("Sepal") } }.num_col.should eq(3)
IRIS_DATA.select(&.except(&.starts_with?("Sepal"))).num_col.should eq(3)
end

it "should not allow to select non-existing column" do
Expand All @@ -57,7 +57,7 @@ module Crysda

it "should allow to select no column" do
SLEEP_DATA.select([] of String).num_col.should eq(0)
IRIS_DATA.select { |e| e.starts_with?("bla") }.num_col.should eq(0)
IRIS_DATA.select(&.starts_with?("bla")).num_col.should eq(0)
end

it "should not allow to select columns twice" do
Expand All @@ -76,13 +76,13 @@ module Crysda
(SLEEP_DATA.names - ["name", "vore"]).should eq(df.names)
end

IRIS_DATA.select { |e| e.starts_with?("Sepal").not }.names.should eq(["Petal.Length", "Petal.Width", "Species"])
IRIS_DATA.select(&.starts_with?("Sepal").not).names.should eq(["Petal.Length", "Petal.Width", "Species"])
end

it "it should not allow a mixed negative and positive selection" do
# note: typically the user would perform a positive selection, but in contexts like gather a negative selection API is needed as well
column_types(IRIS_DATA.select { |e| e.except("Species").and e.starts_with?("Sepal").not }).size.should eq(2)
column_types(IRIS_DATA.select { |e| e.except("Species").and e.except { |c| c.starts_with?("Sepal") } }).size.should eq(2)
column_types(IRIS_DATA.select { |e| e.except("Species").and e.except(&.starts_with?("Sepal")) }).size.should eq(2)

# but one must never mix positive and negative selection
expect_raises(InvalidColumnSelectException, "Mixing positive and negative selection does not have meaningful semantics and is not supported") do
Expand All @@ -91,12 +91,12 @@ module Crysda
end

it "should handle empty negative selections gracefully" do
IRIS_DATA.select { |e| e.except("") }
IRIS_DATA.select(&.except(""))
end

it "should allow to select with matchers in grouped df" do
IRIS_DATA.group_by("Species")
.select { |e| e.ends_with?("Length") }
.select(&.ends_with?("Length"))
.tap do |df|
df.names.should eq(["Species", "Sepal.Length", "Petal.Length"])
end
Expand Down Expand Up @@ -130,7 +130,7 @@ module Crysda
it "it should allow to use a new column in the same mutate call" do
SLEEP_DATA.add_columns(
"vore_new".with { |e| e["vore"] },
"vore_first_char".with { |e| e["vore"].map { |c| c.to_s[0].to_s } }
"vore_first_char".with { |e| e["vore"].map(&.to_s[0].to_s) }
)
end

Expand All @@ -145,7 +145,7 @@ module Crysda

it "it should gracefully reject incorrect type casts" do
expect_raises(Exception) do
SLEEP_DATA.add_column("foo") { |e| e["vore"].as_i }
SLEEP_DATA.add_column("foo", &.["vore"].as_i)
end
end

Expand Down Expand Up @@ -237,16 +237,16 @@ module Crysda
.sample_frac(0.5)
.count("vore")
.filter { |e| e["vore"] == "omni" }
.tap { |e| e["n"].as_i.first.should eq(10) }
.tap(&.["n"].as_i.first.should eq(10))
end

it "should filter rows with text matching helpers" do
SLEEP_DATA.filter { |e| e["vore"].matching { |m| m == "insecti" } }.num_row.should eq(5)
SLEEP_DATA.filter { |e| e["vore"].matching { |m| m.starts_with?("ins") } }.num_row.should eq(5)
SLEEP_DATA.filter { |e| e["vore"].matching(&.== "insecti") }.num_row.should eq(5)
SLEEP_DATA.filter { |e| e["vore"].matching(&.starts_with?("ins")) }.num_row.should eq(5)

df = dataframe_of("x").values(1, 2, 3, 4, 5, nil)
df.filter { |e| e["x"] > 2 }.tap do |fi|
fi.filter { |a| a.is_na("x") }.num_row.should eq(0)
fi.filter(&.is_na("x")).num_row.should eq(0)
fi.num_row.should eq(3)
end

Expand Down Expand Up @@ -349,23 +349,23 @@ module Crysda
end

it "count should work with function literals" do
SLEEP_DATA.add_columns("sleep_na".with { |e| e["sleep_rem"].is_na }).count("sleep_na")
SLEEP_DATA.add_columns("sleep_na".with(&.["sleep_rem"].is_na)).count("sleep_na")

# should be equivalent to
SLEEP_DATA.group_by_expr(TableExpression.new { |e| e["sleep_rem"].is_na }).count.print
SLEEP_DATA.group_by_expr(TableExpression.new(&.["sleep_rem"].is_na)).count.print
SLEEP_DATA.group_by_expr(
TableExpression.new { |e| e["sleep_rem"].is_na },
TableExpression.new { |e| e["sleep_rem"].is_na },
TableExpression.new(&.["sleep_rem"].is_na),
TableExpression.new(&.["sleep_rem"].is_na),
).count.print
SLEEP_DATA.group_by_expr.count.print
end

it "summarize multiple columns at once with summarize_at" do
IRIS_DATA.summarize_at(
ColumnSelector.new { |e| e.starts_with?("Sepal") },
ColumnSelector.new(&.starts_with?("Sepal")),
SummarizeFunc.new do |s|
s.add(SumFormula.new { |e| e.mean }, "mean")
s.add(SumFormula.new { |e| e.median }, "median")
s.add(SumFormula.new(&.mean), "mean")
s.add(SumFormula.new(&.median), "median")
end
).tap do |df|
df.print
Expand All @@ -375,10 +375,10 @@ module Crysda

# using variadic arguments
IRIS_DATA.summarize_at(
ColumnSelector.new { |e| e.ends_with?("Length") },
ColumnSelector.new(&.ends_with?("Length")),
AggFuncs.mean,
# AggFuncs.median,
AggFunc.new(SumFormula.new { |c| c.median }, "median")
AggFunc.new(SumFormula.new(&.median), "median")
).tap do |df|
df.print
df.num_row.should eq(1)
Expand All @@ -389,7 +389,7 @@ module Crysda
it "summarize multiple columns in grouped data frame with summarize_at" do
IRIS_DATA.group_by("Species")
.summarize_at(
ColumnSelector.new { |e| e.ends_with?("Length") },
ColumnSelector.new(&.ends_with?("Length")),
AggFuncs.mean
).tap do |df|
df.print
Expand Down Expand Up @@ -538,7 +538,7 @@ STR
dfb = df.select("age", "last_name", "weight", "first_name")

# by joining with multiple attributes we inherently group (which is the actual test)
df.left_join(dfb, by: ["last_name", "first_name"]).tap { |v| v.num_row.should eq(3) }
df.left_join(dfb, by: ["last_name", "first_name"]).tap(&.num_row.should eq(3))
end

it "it should group tables with object columns and by object column" do
Expand Down
16 changes: 8 additions & 8 deletions spec/reshape_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ module Crysda
wide_data = data.map_with_index { |d, i| Float64Col.new(i.to_s, d).as(DataCol) }.bind_cols
.add_row_number("y")

wide_data.gather("x", "pixel_value", ColumnSelector.new { |x| x.except("y") }).tap do |df|
wide_data.gather("x", "pixel_value", ColumnSelector.new(&.except("y"))).tap do |df|
df.print
column_types(df)[2].type.should eq("Float64")
df.names.should eq(["y", "x", "pixel_value"])
Expand All @@ -84,7 +84,7 @@ module Crysda
"Anna", Address.new("Mueller Street", "New York"),
Address.new("Stresemannplatz", "Munich")
)
data.gather("type", "address", ColumnSelector.new { |x| x.ends_with?("address") }).tap do |df|
data.gather("type", "address", ColumnSelector.new(&.ends_with?("address"))).tap do |df|
df.schema
df.num_col.should eq(3)
df.names.should eq(["name", "type", "address"])
Expand All @@ -103,9 +103,9 @@ module Crysda

wide_df.gather("property", "value", ColumnSelector.new { |x| (x.except("person")).and x.starts_with?("person") })

wide_df.gather("property", "value", ColumnSelector.new { |x| x.except("person") })
wide_df.gather("property", "value", ColumnSelector.new(&.except("person")))

wide_df.gather("property", "value", ColumnSelector.new { |x| x.except("person") })
wide_df.gather("property", "value", ColumnSelector.new(&.except("person")))
.tap do |wf|
wf.print
annual_salary = wf.filter { |x| (x["person"] == "anna").and(x["property"] == "salary") }
Expand Down Expand Up @@ -152,7 +152,7 @@ ERR
df["test"].size.should eq(df.num_row)
end

united = SLEEP_DATA.unite("test", ColumnSelector.new { |c| c.list_of(["name", "sleep_rem"]) }, sep: ",")
united = SLEEP_DATA.unite("test", ColumnSelector.new(&.list_of(["name", "sleep_rem"])), sep: ",")

united.separate("test", ["new_name", "new_sleep_rem"], convert: true, sep: ",").tap do |df|
df.take.print
Expand Down Expand Up @@ -182,7 +182,7 @@ ERR
end

it "nest selected columns only" do
IRIS_DATA.nest(ColumnSelector.new { |c| c.except("Species") }).tap do |df|
IRIS_DATA.nest(ColumnSelector.new(&.except("Species"))).tap do |df|
df.schema
df.num_row.should eq 3
df.num_col.should eq 2
Expand All @@ -193,7 +193,7 @@ ERR
it "should unnest data" do
# use other small but NA-heavy data set here
restored = SLEEP_DATA
.nest(ColumnSelector.new { |c| c.except("order") })
.nest(ColumnSelector.new(&.except("order")))
.unnest(DataFrame::DEF_NEST_COLUMN_NAME)
.sort_by("order")
.move_left("name", "genus", "vore")
Expand Down Expand Up @@ -237,7 +237,7 @@ ERR
d.print
d.num_row.should eq(6)
d.num_col.should eq(4)
d.filter { |f| f["weight"].is_na }.num_row.should eq(3)
d.filter(&.["weight"].is_na).num_row.should eq(3)
end

# next steps in here: implement test nesting support ...
Expand Down
1 change: 0 additions & 1 deletion src/crysda.cr
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,3 @@ module Crysda
end

require "./**"

Loading

0 comments on commit caac92b

Please sign in to comment.