diff --git a/docs/index.html b/docs/index.html index dc706ed..4ff81be 100644 --- a/docs/index.html +++ b/docs/index.html @@ -184,7 +184,8 @@
Status: Some parts of the underlying libraries are stable. Some parts of Noj are still experimental, and the API will change. These details should be clarified soon.
-See the standalone repo example: Noj - getting started - from raw data to a blog post
+#uuid "a94be0fe-cf0d-4958-911b-91eb334d6b71" {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "79d462b8-51cf-4cd8-841f-eec3177b8560", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}
}
+:metamorph/mode :fit
#uuid "fb791a68-7b46-4e12-ab0c-f9ac09b143d6" {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "9dd62816-15e5-4d7a-9878-03a838ec6329", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}
}
keys ctx-after-train) (
:metamorph/data
(:metamorph/mode
- "a94be0fe-cf0d-4958-911b-91eb334d6b71") #uuid
This context map has the “data”, the “mode” and a UUID for each operation (we had only one in this pipeline)
:fit
-
{:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)},
+
{:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)},
:options {:model-type :metamorph.ml/dummy-classifier},
- :id #uuid "79d462b8-51cf-4cd8-841f-eec3177b8560",
+ :id #uuid "9dd62816-15e5-4d7a-9878-03a838ec6329",
:feature-columns [:sex :pclass :embarked],
:target-columns [:survived],
:target-categorical-maps
@@ -688,70 +688,70 @@
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
...
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
-1.0
+0.0
@@ -769,7 +769,7 @@
-#uuid "a94be0fe-cf0d-4958-911b-91eb334d6b71"
+#uuid "fb791a68-7b46-4e12-ab0c-f9ac09b143d6"
@@ -810,38 +810,38 @@
-1.0
-3.0
+0.0
+2.0
0.0
0.0
-3.0
-0.0
+1.0
+1.0
0.0
3.0
-0.0
+1.0
0.0
-1.0
+3.0
0.0
1.0
-3.0
+2.0
0.0
-0.0
-3.0
+1.0
+2.0
0.0
+0.0
1.0
-3.0
0.0
@@ -850,14 +850,14 @@ 0.0
-0.0
-2.0
+1.0
+3.0
0.0
-1.0
-1.0
0.0
+1.0
+2.0
...
@@ -865,58 +865,58 @@ ...
-1.0
-1.0
-0.0
-
-
0.0
3.0
0.0
-
-0.0
-2.0
-0.0
-
-0.0
-3.0
-0.0
+1.0
+1.0
+2.0
-0.0
+1.0
3.0
2.0
1.0
1.0
-2.0
+0.0
+0.0
1.0
-2.0
0.0
1.0
-2.0
+3.0
0.0
-1.0
-1.0
+0.0
+3.0
0.0
1.0
+3.0
+0.0
+
+
+0.0
+2.0
+0.0
+
+
+0.0
1.0
2.0
-1.0
-1.0
+0.0
+3.0
2.0
@@ -927,10 +927,10 @@ :model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}
+:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}
-:id #uuid "79d462b8-51cf-4cd8-841f-eec3177b8560"
+:id #uuid "9dd62816-15e5-4d7a-9878-03a838ec6329"
@@ -953,7 +953,7 @@
-0.0
+1.0
0.0
@@ -962,22 +962,22 @@ 0.0
-1.0
+0.0
1.0
-0.0
+1.0
0.0
-1.0
+0.0
-0.0
+1.0
1.0
@@ -986,37 +986,37 @@ ...
-1.0
+0.0
-0.0
+1.0
0.0
-0.0
+1.0
0.0
-1.0
+0.0
-1.0
+0.0
1.0
-1.0
+0.0
1.0
-1.0
+0.0
@@ -1045,7 +1045,7 @@
;float64>[178]
#tech.v3.dataset.column<:survived
-1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...] [
+0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...] [
This works as long as all operations of the pipeline follow the metamorph convention (we can create such compliant functions, out of normal dataset->dataset functions, as we will see)
my-pipeline
represents therefore a not yet executed model training / prediction. It can be freely moved around and applied to a dataset when needed.
@@ -1229,7 +1229,7 @@ :metamorph/mode :fit#uuid "59740c08-2901-4a4e-947c-6ca3a6882e60" {:model-data {:majority-class 1, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "47579ce8-ce49-4a37-a203-a6c1f8cf7d57", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}
}
+:metamorph/mode :fit
#uuid "2e378827-56f1-45ff-8907-3e0bd19e48b3" {:model-data {:majority-class 1, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "a88653e6-ad63-4bc8-b06f-4fdc3fd43e89", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}
}
To show the power of pipelines, I start with the simplest possible pipeline and then expand on it.
we can already chain train and test with usual functions:
@@ -1242,7 +1242,7 @@
;float64>[178]
#tech.v3.dataset.column<:survived
-1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...] [
+0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]
[
the same with pipelines
@@ -1258,7 +1258,7 @@
;float64>[178]
#tech.v3.dataset.column<:survived
-1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...] [
+0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...] [
@@ -1455,16 +1455,6 @@
-[:sex :pclass]
-0.78633911
-{:model-type :smile.classification/random-forest}
-
-
-[:sex :embarked]
-0.78633276
-{:model-type :smile.classification/ada-boost}
-
-
[:sex]
0.78633276
{:model-type :smile.classification/ada-boost}
@@ -1490,10 +1480,20 @@
+[:sex :pclass]
+0.78521551
+{:model-type :smile.classification/random-forest}
+
+
[:sex :pclass :embarked]
-0.78298737
+0.77849299
{:model-type :smile.classification/random-forest}
+
+[:sex :pclass]
+0.77734400
+{:model-type :smile.classification/ada-boost}
+
[:sex :pclass :embarked]
0.77507776
@@ -1505,15 +1505,15 @@
-[:pclass :embarked]
-0.63783406
-{:model-type :smile.classification/random-forest}
-
-
[:embarked]
0.63780232
{:model-type :smile.classification/logistic-regression}
+
+[:pclass :embarked]
+0.63670412
+{:model-type :smile.classification/random-forest}
+
[:embarked]
0.63218435
diff --git a/docs/noj_book.interactions_ols.html b/docs/noj_book.interactions_ols.html
index d39f3cb..418b759 100644
--- a/docs/noj_book.interactions_ols.html
+++ b/docs/noj_book.interactions_ols.html
@@ -342,19 +342,19 @@
Residuals:1Q Median 3Q Max
- Min 10.4530 -1.1270 0.4041 1.5434 3.5116
+ -7.0837 -0.7686 0.2551 1.3391 3.1244
-
Coefficients:>|t|)
- Estimate Std. Error t value Pr(3.3370 0.4750 7.0255 0.0000 ***
- Intercept 0.0458 0.0018 25.4854 0.0000 ***
- youtube 0.1893 0.0105 18.0913 0.0000 ***
+ facebook 4.0113 0.4016 9.9873 0.0000 ***
+ Intercept 0.0432 0.0016 26.2031 0.0000 ***
+ youtube 0.1919 0.0092 20.8554 0.0000 ***
facebook
---------------------------------------------------------------------0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Significance codes:
-2.1938 on 130 degrees of freedom
- Residual standard error: 0.8818, Adjusted R-squared: 0.8800
- Multiple R-squared: 485.1281 on 3 and 130 DF, p-value: 5.119e-61
+ F-statistic: 1.9175 on 130 degrees of freedom
+ Residual standard error: 0.9066, Adjusted R-squared: 0.9051
+ Multiple R-squared: 630.7411 on 3 and 130 DF, p-value: 1.203e-67
F-statistic:
We have the following metrics:
\(RMSE\)
@@ -362,14 +362,14 @@ -> evaluations flatten first :test-transform :metric) (
-1.637953126356843
+2.265017256746244
\(R^2\)
-> evaluations flatten first :test-transform :other-metrices first :metric) (
-0.9322495403155694
+0.8844252104035826
@@ -401,20 +401,20 @@
Residuals:1Q Median 3Q Max
- Min 7.3257 -0.4254 0.2263 0.7205 1.6955
+ -7.2707 -0.4485 0.2310 0.7650 1.7510
-
Coefficients:>|t|)
- Estimate Std. Error t value Pr(8.3738 0.3887 21.5428 0.0000 ***
- Intercept 0.0180 0.0019 9.3218 0.0000 ***
- youtube 0.0172 0.0112 1.5405 0.1259
- facebook 0.0009 0.0001 17.4617 0.0000 ***
+ youtube*facebook 8.1095 0.3905 20.7660 0.0000 ***
+ Intercept 0.0194 0.0020 9.6959 0.0000 ***
+ youtube 0.0216 0.0114 1.8930 0.0606 .
+ facebook 0.0009 0.0001 15.9876 0.0000 ***
youtube*facebook
---------------------------------------------------------------------0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Significance codes:
-1.1550 on 129 degrees of freedom
- Residual standard error: 0.9683, Adjusted R-squared: 0.9675
- Multiple R-squared: 1312.8955 on 4 and 129 DF, p-value: 1.919e-96
+ F-statistic: 1.2222 on 129 degrees of freedom
+ Residual standard error: 0.9615, Adjusted R-squared: 0.9607
+ Multiple R-squared: 1075.2548 on 4 and 129 DF, p-value: 4.777e-91
F-statistic:
As the multiplication of youtube*facebook
is as well statistically relevant, it suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook.
\(RMSE\)
@@ -422,14 +422,14 @@ -> evaluations flatten first :test-transform :metric) (
-1.1052884033088548
+0.9588610422624989
\(R^2\)
-> evaluations flatten first :test-transform :other-metrices first :metric) (
-0.9659333629686162
+0.977610159246305
\(RMSE\) and \(R^2\) of the interaction model are slightly better.
These results suggest that the model with the interaction term is better than the model that contains only main effects. So, for this specific data, we should go for the model with the interaction term.
diff --git a/docs/noj_book.ml_basic.html b/docs/noj_book.ml_basic.html
index 158a4b7..edd9e4c 100644
--- a/docs/noj_book.ml_basic.html
+++ b/docs/noj_book.ml_basic.html
@@ -1176,9 +1176,9 @@ reverse)
-:predictor :sex, :importance 42.52097292426236}
- ({:predictor :pclass, :importance 10.164600856439982}
- {:predictor :embarked, :importance 1.8005940664569304}) {
+:predictor :sex, :importance 44.05095738906217}
+ ({:predictor :pclass, :importance 9.824296073595015}
+ {:predictor :embarked, :importance 1.6441426390534277}) {
we can see that :sex is more important to predict :survived than :pclass and :embarked
diff --git a/docs/noj_book.python.html b/docs/noj_book.python.html
index bf07113..dc40868 100644
--- a/docs/noj_book.python.html
+++ b/docs/noj_book.python.html
@@ -294,14 +294,14 @@ :x sine-data)
:y sine-data)))
-
+
(
(vis.python/pyplot
#(matplotlib.pyplot/plot:x sine-data)
(:y sine-data))) (
-
+
(https://seaborn.pydata.org/tutorial/introduction
let [tips (sns/load_dataset "tips")]
@@ -315,7 +315,7 @@ (:style "smoker"
:size "size")))
-
+
:bye
diff --git a/docs/noj_book.statistics.html b/docs/noj_book.statistics.html
index c904500..d295677 100644
--- a/docs/noj_book.statistics.html
+++ b/docs/noj_book.statistics.html
@@ -395,12 +395,12 @@
:target-columns [:sepal-length],
:explained #function[clojure.lang.AFunction/1],
:R2 0.8582120394596505,
- :id #uuid "09969a91-5e57-4ee3-aea0-d3d9bf24a215",
+ :id #uuid "4d455060-963a-4792-a46d-8680d97643c8",
:predictions #tech.v3.dataset.column<float64>[150]
:sepal-length
5.022, 4.724, 4.775, 4.851, 5.081, 5.360, 4.911, 5.030, 4.664, 4.903, 5.209, 5.098, 4.775, 4.572, 5.184, 5.522, 5.089, 4.970, 5.352, 5.217...],
[:predict
- 63683],
+ #function[scicloj.noj.v1.stats/regression-model/predict--60541],
#function[scicloj.noj.v1.stats/regression-model/predict--:options {:model-type :smile.regression/elastic-net}}
@@ -416,12 +416,12 @@
:target-columns [:sepal-length],
:explained #function[clojure.lang.AFunction/1],
:R2 0.8586117200663171,
- :id #uuid "18236b22-7811-472d-97cb-72146a8e1c5a",
+ :id #uuid "7e2014b4-28fc-4147-abeb-73b06bd33c87",
:predictions #tech.v3.dataset.column<float64>[150]
:sepal-length
5.015, 4.690, 4.749, 4.826, 5.080, 5.377, 4.895, 5.021, 4.625, 4.882, 5.216, 5.092, 4.746, 4.533, 5.199, 5.561, 5.094, 4.960, 5.368, 5.226...],
[:predict
- 63683],
+ #function[scicloj.noj.v1.stats/regression-model/predict--60541],
#function[scicloj.noj.v1.stats/regression-model/predict--:options {:model-type :smile.regression/ordinary-least-square}}
The stats/linear-regression-model
convenience function uses specifically the :smile.regression/ordinary-least-square
model type.
@@ -437,12 +437,12 @@
:target-columns [:sepal-length],
:explained #function[clojure.lang.AFunction/1],
:R2 0.8586117200663171,
- :id #uuid "a0a398ca-862a-4568-892c-4558ceb9bafb",
+ :id #uuid "75e25455-6a0e-4174-9ac6-24ba37c19d88",
:predictions #tech.v3.dataset.column<float64>[150]
:sepal-length
5.015, 4.690, 4.749, 4.826, 5.080, 5.377, 4.895, 5.021, 4.625, 4.882, 5.216, 5.092, 4.746, 4.533, 5.199, 5.561, 5.094, 4.960, 5.368, 5.226...],
[:predict
- 63683],
+ #function[scicloj.noj.v1.stats/regression-model/predict--60541],
#function[scicloj.noj.v1.stats/regression-model/predict--:options {:model-type :smile.regression/ordinary-least-square}}
@@ -703,7 +703,7 @@ :target-columns [:sepal-length],
:explained #function[clojure.lang.AFunction/1],
:R2 0.8586117200663171,
- :id #uuid "ea4f1683-8c46-44b3-bec2-02f19d9fb7c7",
+ :id #uuid "7650009a-4dc7-4aa4-9886-738257925fb4",
:options {:model-type :smile.regression/ordinary-least-square}}}
@@ -726,29 +726,29 @@
-28
-0.00802265
-0.20465354
+20
+0.00050999
+0.19394683
-18
-0.20465354
-0.40128444
+14
+0.19394683
+0.38738366
-17
-0.40128444
-0.59791533
+19
+0.38738366
+0.58082049
-19
-0.59791533
-0.79454622
+21
+0.58082049
+0.77425733
-17
-0.79454622
-0.99117712
+25
+0.77425733
+0.96769416
diff --git a/docs/noj_book.visualization.html b/docs/noj_book.visualization.html
index 29a8463..e8513c7 100644
--- a/docs/noj_book.visualization.html
+++ b/docs/noj_book.visualization.html
@@ -310,7 +310,7 @@ :height 300,
:data
:values
- {"x,y\n0,-0.004207939587986709\n1,-0.055789133080412334\n2,-0.2768589633756995\n3,-0.22201979781606618\n4,0.24555568601114297\n5,0.44839725879571757\n6,0.359213993628024\n7,0.7215768669433458\n8,0.358897975524867\n9,-0.12917456247823011\n10,-0.26944984279344497\n11,-0.6118165371009159\n12,-1.111534335097509\n13,-1.010186411979333\n14,-1.3058657472829194\n15,-1.5044117735506326\n16,-1.6613913902571045\n17,-1.278309667469086\n18,-1.5370082224387598\n19,-1.3880203897513188\n",
+ "x,y\n0,-0.1325648359179058\n1,-0.27990686413394583\n2,-0.11718484436161825\n3,-0.2462438952131002\n4,-0.7000742338173375\n5,-0.8118999614681428\n6,-0.34004919446904114\n7,-0.45753462191489835\n8,-0.09561111731129412\n9,0.3586051405571238\n10,0.6155968621550317\n11,0.8444353375346749\n12,0.5098834495225566\n13,0.653697725279412\n14,0.5704985315016169\n15,0.51532920679822\n16,0.02004226065388337\n17,-0.11893807448086313\n18,0.18608707238769473\n19,0.3880074626880815\n",
:format {:type "csv"}}}
diff --git a/docs/noj_book.visualization_files/0.csv b/docs/noj_book.visualization_files/0.csv
index ce1f695..b5cc4b0 100644
--- a/docs/noj_book.visualization_files/0.csv
+++ b/docs/noj_book.visualization_files/0.csv
@@ -1,21 +1,21 @@
x,y
-0,-0.004207939587986709
-1,-0.055789133080412334
-2,-0.2768589633756995
-3,-0.22201979781606618
-4,0.24555568601114297
-5,0.44839725879571757
-6,0.359213993628024
-7,0.7215768669433458
-8,0.358897975524867
-9,-0.12917456247823011
-10,-0.26944984279344497
-11,-0.6118165371009159
-12,-1.111534335097509
-13,-1.010186411979333
-14,-1.3058657472829194
-15,-1.5044117735506326
-16,-1.6613913902571045
-17,-1.278309667469086
-18,-1.5370082224387598
-19,-1.3880203897513188
+0,-0.1325648359179058
+1,-0.27990686413394583
+2,-0.11718484436161825
+3,-0.2462438952131002
+4,-0.7000742338173375
+5,-0.8118999614681428
+6,-0.34004919446904114
+7,-0.45753462191489835
+8,-0.09561111731129412
+9,0.3586051405571238
+10,0.6155968621550317
+11,0.8444353375346749
+12,0.5098834495225566
+13,0.653697725279412
+14,0.5704985315016169
+15,0.51532920679822
+16,0.02004226065388337
+17,-0.11893807448086313
+18,0.18608707238769473
+19,0.3880074626880815
diff --git a/docs/search.json b/docs/search.json
index f283815..80ca0d0 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -9,12 +9,22 @@
"1 Preface"
]
},
+ {
+ "objectID": "index.html#getting-started",
+ "href": "index.html#getting-started",
+ "title": "Noj Documentation",
+ "section": "1.1 Getting started",
+ "text": "1.1 Getting started\nSee the standalone repo example: Noj - getting started - from raw data to a blog post",
+ "crumbs": [
+ "1 Preface"
+ ]
+ },
{
"objectID": "index.html#existing-chapters-in-this-book",
"href": "index.html#existing-chapters-in-this-book",
"title": "Noj Documentation",
- "section": "1.1 Existing chapters in this book:",
- "text": "1.1 Existing chapters in this book:\n\nMachine learning specific functionality in tech.ml.dataset\nMachine learning\nAutoML using metamorph pipelines\nOrdinary least squares with interactions\nDatasets\nPython (experimental 🛠)\nStatistics (experimental 🛠)\nVisualization (experimental 🛠)\nStatistical Visualization (experimental 🛠)\nMore visualization examples (experimental 🛠)\n\n\nsource: notebooks/index.clj",
+ "section": "1.2 Existing chapters in this book:",
+ "text": "1.2 Existing chapters in this book:\n\nMachine learning specific functionality in tech.ml.dataset\nMachine learning\nAutoML using metamorph pipelines\nOrdinary least squares with interactions\nDatasets\nPython (experimental 🛠)\nStatistics (experimental 🛠)\nVisualization (experimental 🛠)\nStatistical Visualization (experimental 🛠)\nMore visualization examples (experimental 🛠)\n\n\nsource: notebooks/index.clj",
"crumbs": [
"1 Preface"
]
@@ -134,7 +144,7 @@
"href": "noj_book.ml_basic.html#random-forrest",
"title": "3 Machine learning",
"section": "3.5 Random forrest",
- "text": "3.5 Random forrest\nNext is random forrest\n\n(def rf-model (ml/train (:train split) {:model-type :smile.classification/random-forest}))\n\n\n(def rf-prediction\n (ml/predict (:test split) rf-model))\n\n\n(loss/classification-accuracy\n (:survived (ds-cat/reverse-map-categorical-xforms (:test split)))\n (:survived (ds-cat/reverse-map-categorical-xforms rf-prediction)))\n\n\n0.7676767676767677\n\nbest so far, 71 %\nFrom the logistic regression model we can get via java Interop some model explanations, for example the variable importance.\n\n(->>\n (map\n (fn [predictor importance]\n (hash-map :predictor (-> predictor str csk/->kebab-case-keyword)\n :importance importance))\n\n (-> rf-model ml/thaw-model .formula .predictors)\n (-> rf-model ml/thaw-model .importance))\n (sort-by :importance)\n reverse)\n\n\n({:predictor :sex, :importance 42.52097292426236}\n {:predictor :pclass, :importance 10.164600856439982}\n {:predictor :embarked, :importance 1.8005940664569304})\n\nwe can see that :sex is more important to predict :survived then :pclass and :embark",
+ "text": "3.5 Random forrest\nNext is random forrest\n\n(def rf-model (ml/train (:train split) {:model-type :smile.classification/random-forest}))\n\n\n(def rf-prediction\n (ml/predict (:test split) rf-model))\n\n\n(loss/classification-accuracy\n (:survived (ds-cat/reverse-map-categorical-xforms (:test split)))\n (:survived (ds-cat/reverse-map-categorical-xforms rf-prediction)))\n\n\n0.7676767676767677\n\nbest so far, 71 %\nFrom the logistic regression model we can get via java Interop some model explanations, for example the variable importance.\n\n(->>\n (map\n (fn [predictor importance]\n (hash-map :predictor (-> predictor str csk/->kebab-case-keyword)\n :importance importance))\n\n (-> rf-model ml/thaw-model .formula .predictors)\n (-> rf-model ml/thaw-model .importance))\n (sort-by :importance)\n reverse)\n\n\n({:predictor :sex, :importance 44.05095738906217}\n {:predictor :pclass, :importance 9.824296073595015}\n {:predictor :embarked, :importance 1.6441426390534277})\n\nwe can see that :sex is more important to predict :survived then :pclass and :embark",
"crumbs": [
"3 Machine learning"
]
@@ -154,7 +164,7 @@
"href": "noj_book.automl.html#the-metamorph-pipeline-abstraction",
"title": "4 AutoML using metamorph pipelines",
"section": "",
- "text": "(require '[scicloj.metamorph.ml :as ml]\n '[scicloj.metamorph.core :as mm]\n '[tablecloth.api :as tc])\n\n\n\n(def titanic ml-basic/numeric-titanic-data)\n\n\n\n(def splits (first (tc/split->seq titanic)))\n\n\n(def train-ds (:train splits))\n\n\n(def test-ds (:test splits))\n\n\n\n\n(def my-pipeline\n (mm/pipeline\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n\nmy-pipeline\n\n\n#function[clojure.core/partial/fn--5908]\n\n\n\n\n\n(def ctx-after-train\n (my-pipeline {:metamorph/data train-ds\n :metamorph/mode :fit}))\n\n\nctx-after-train\n\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\nGroup: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n1.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n3.0\n2.0\n1.0\n\n\n...\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"a94be0fe-cf0d-4958-911b-91eb334d6b71\" {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid \"79d462b8-51cf-4cd8-841f-eec3177b8560\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? 
nil}}\n\n\n(keys ctx-after-train)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"a94be0fe-cf0d-4958-911b-91eb334d6b71\")\n\n\n\n(vals ctx-after-train)\n\n\n(Group: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n1.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n3.0\n2.0\n1.0\n\n\n...\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n\n:fit\n{:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)},\n :options {:model-type :metamorph.ml/dummy-classifier},\n :id #uuid \"79d462b8-51cf-4cd8-841f-eec3177b8560\",\n :feature-columns [:sex :pclass :embarked],\n :target-columns [:survived],\n :target-categorical-maps\n {:survived\n {:lookup-table {0 0, 1 1},\n :src-column :survived,\n :result-datatype :float64}},\n :scicloj.metamorph.ml/unsupervised? 
nil}\n)\n\n\n\n\n(def ctx-after-predict\n (my-pipeline (assoc ctx-after-train\n :metamorph/mode :transform\n :metamorph/data test-ds)))\n\n\nctx-after-predict\n\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [178 1]:\n\n\n\n:survived\n\n\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n...\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n\n\n\n\n\n\n:metamorph/mode :transform\n\n\n\n\n\n\n\n\n#uuid \"a94be0fe-cf0d-4958-911b-91eb334d6b71\"\n\n\n\n{\n\n\n:feature-columns [:sex :pclass :embarked]\n\n\n:target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}\n\n\n:target-columns [:survived]\n\n\n:scicloj.metamorph.ml/unsupervised? nil\n\n\n\n\n\n\n\n\n\n:scicloj.metamorph.ml/feature-ds\n\n\n\nGroup: 0 [178 3]:\n\n\n\n:sex\n:pclass\n:embarked\n\n\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n0.0\n\n\n...\n...\n...\n\n\n1.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n2.0\n\n\n1.0\n1.0\n2.0\n\n\n1.0\n2.0\n0.0\n\n\n1.0\n2.0\n0.0\n\n\n1.0\n1.0\n0.0\n\n\n1.0\n1.0\n2.0\n\n\n1.0\n1.0\n2.0\n\n\n\n\n\n\n\n\n\n:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}\n\n\n:id #uuid \"79d462b8-51cf-4cd8-841f-eec3177b8560\"\n\n\n\n\n\n\n\n\n\n:scicloj.metamorph.ml/target-ds\n\n\n\nGroup: 0 [178 1]:\n\n\n\n:survived\n\n\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n1.0\n\n\n1.0\n\n\n0.0\n\n\n0.0\n\n\n1.0\n\n\n0.0\n\n\n1.0\n\n\n...\n\n\n1.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n1.0\n\n\n\n\n\n\n\n\n\n:options {:model-type :metamorph.ml/dummy-classifier}\n\n\n}\n\n\n\n\n\n}\n\n\n\n\n(-> ctx-after-predict :metamorph/data 
:survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]",
+ "text": "(require '[scicloj.metamorph.ml :as ml]\n '[scicloj.metamorph.core :as mm]\n '[tablecloth.api :as tc])\n\n\n\n(def titanic ml-basic/numeric-titanic-data)\n\n\n\n(def splits (first (tc/split->seq titanic)))\n\n\n(def train-ds (:train splits))\n\n\n(def test-ds (:test splits))\n\n\n\n\n(def my-pipeline\n (mm/pipeline\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n\nmy-pipeline\n\n\n#function[clojure.core/partial/fn--5908]\n\n\n\n\n\n(def ctx-after-train\n (my-pipeline {:metamorph/data train-ds\n :metamorph/mode :fit}))\n\n\nctx-after-train\n\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\nGroup: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n...\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"fb791a68-7b46-4e12-ab0c-f9ac09b143d6\" {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid \"9dd62816-15e5-4d7a-9878-03a838ec6329\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? 
nil}}\n\n\n(keys ctx-after-train)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fb791a68-7b46-4e12-ab0c-f9ac09b143d6\")\n\n\n\n(vals ctx-after-train)\n\n\n(Group: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n...\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n\n:fit\n{:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)},\n :options {:model-type :metamorph.ml/dummy-classifier},\n :id #uuid \"9dd62816-15e5-4d7a-9878-03a838ec6329\",\n :feature-columns [:sex :pclass :embarked],\n :target-columns [:survived],\n :target-categorical-maps\n {:survived\n {:lookup-table {0 0, 1 1},\n :src-column :survived,\n :result-datatype :float64}},\n :scicloj.metamorph.ml/unsupervised? 
nil}\n)\n\n\n\n\n(def ctx-after-predict\n (my-pipeline (assoc ctx-after-train\n :metamorph/mode :transform\n :metamorph/data test-ds)))\n\n\nctx-after-predict\n\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [178 1]:\n\n\n\n:survived\n\n\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n...\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :transform\n\n\n\n\n\n\n\n\n#uuid \"fb791a68-7b46-4e12-ab0c-f9ac09b143d6\"\n\n\n\n{\n\n\n:feature-columns [:sex :pclass :embarked]\n\n\n:target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column :survived, :result-datatype :float64}}\n\n\n:target-columns [:survived]\n\n\n:scicloj.metamorph.ml/unsupervised? nil\n\n\n\n\n\n\n\n\n\n:scicloj.metamorph.ml/feature-ds\n\n\n\nGroup: 0 [178 3]:\n\n\n\n:sex\n:pclass\n:embarked\n\n\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n2.0\n0.0\n\n\n1.0\n2.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n2.0\n\n\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n2.0\n\n\n1.0\n3.0\n2.0\n\n\n1.0\n1.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n\n\n0.0\n3.0\n2.0\n\n\n\n\n\n\n\n\n\n:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}\n\n\n:id #uuid \"9dd62816-15e5-4d7a-9878-03a838ec6329\"\n\n\n\n\n\n\n\n\n\n:scicloj.metamorph.ml/target-ds\n\n\n\nGroup: 0 [178 1]:\n\n\n\n:survived\n\n\n\n\n1.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n1.0\n\n\n1.0\n\n\n0.0\n\n\n0.0\n\n\n1.0\n\n\n1.0\n\n\n...\n\n\n0.0\n\n\n1.0\n\n\n0.0\n\n\n1.0\n\n\n0.0\n\n\n0.0\n\n\n0.0\n\n\n1.0\n\n\n0.0\n\n\n1.0\n\n\n0.0\n\n\n\n\n\n\n\n\n\n:options {:model-type :metamorph.ml/dummy-classifier}\n\n\n}\n\n\n\n\n\n}\n\n\n\n\n(-> ctx-after-predict :metamorph/data 
:survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]",
"crumbs": [
"4 AutoML using metamorph pipelines"
]
@@ -164,7 +174,7 @@
"href": "noj_book.automl.html#use-metamorph-pipelines-to-do-model-training-with-higher-level-api",
"title": "4 AutoML using metamorph pipelines",
"section": "4.2 Use metamorph pipelines to do model training with higher level API",
- "text": "4.2 Use metamorph pipelines to do model training with higher level API\nAs user of metamorph.ml we do not need to deal with this low-level details of how metamorph works, we have convenience functions which hide this\nThe following code will do the same as train, but return a context object, which contains the trained model, so it will execute the pipeline, and not only create it.\nIt uses a convenience function mm/fit which generates compliant context maps internally and executes the pipeline as well.\nThe ctx acts a collector of everything “learned” during :fit, mainly the trained model, but it could be as well other information learned from the data during :fit and to be applied at :transform .\n\n(def train-ctx\n (mm/fit titanic\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n(The dummy-classifier model does not have a lot of state, so there is little to see)\n\ntrain-ctx\n\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n3.0\n0.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n1.0\n\n\n1.0\n2.0\n2.0\n1.0\n\n\n...\n...\n...\n...\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"59740c08-2901-4a4e-947c-6ca3a6882e60\" {:model-data {:majority-class 1, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid \"47579ce8-ce49-4a37-a203-a6c1f8cf7d57\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column 
:survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}}\n\nTo show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.\nwe can already chain train and test with usual functions:\n\n(->>\n (ml/train train-ds {:model-type :metamorph.ml/dummy-classifier})\n (ml/predict test-ds)\n :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]\n\nthe same with pipelines\n\n(def pipeline\n (mm/pipeline (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n(->>\n (mm/fit-pipe train-ds pipeline)\n (mm/transform-pipe test-ds pipeline)\n :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]",
+ "text": "4.2 Use metamorph pipelines to do model training with higher level API\nAs user of metamorph.ml we do not need to deal with this low-level details of how metamorph works, we have convenience functions which hide this\nThe following code will do the same as train, but return a context object, which contains the trained model, so it will execute the pipeline, and not only create it.\nIt uses a convenience function mm/fit which generates compliant context maps internally and executes the pipeline as well.\nThe ctx acts a collector of everything “learned” during :fit, mainly the trained model, but it could be as well other information learned from the data during :fit and to be applied at :transform .\n\n(def train-ctx\n (mm/fit titanic\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n(The dummy-classifier model does not have a lot of state, so there is little to see)\n\ntrain-ctx\n\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n3.0\n0.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n1.0\n\n\n1.0\n2.0\n2.0\n1.0\n\n\n...\n...\n...\n...\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"2e378827-56f1-45ff-8907-3e0bd19e48b3\" {:model-data {:majority-class 1, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid \"a88653e6-ad63-4bc8-b06f-4fdc3fd43e89\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {0 0, 1 1}, :src-column 
:survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}}\n\nTo show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.\nwe can already chain train and test with usual functions:\n\n(->>\n (ml/train train-ds {:model-type :metamorph.ml/dummy-classifier})\n (ml/predict test-ds)\n :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]\n\nthe same with pipelines\n\n(def pipeline\n (mm/pipeline (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n(->>\n (mm/fit-pipe train-ds pipeline)\n (mm/transform-pipe test-ds pipeline)\n :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]",
"crumbs": [
"4 AutoML using metamorph pipelines"
]
@@ -184,7 +194,7 @@
"href": "noj_book.automl.html#finding-the-best-model-automatically",
"title": "4 AutoML using metamorph pipelines",
"section": "4.4 Finding the best model automatically",
- "text": "4.4 Finding the best model automatically\nThe advantage of the pipelines is even more visible, if we want to have configurable pipelines, and do a grid search to find optimal settings.\nthe following will find the best model across: * 5 different model classes * 6 different selections of used features * k-cross validate this with different test / train splits\n\n(defn make-pipe-fn [model-type features]\n (mm/pipeline\n ;; store the used features in ctx, so we can retrieve them at the end\n (fn [ctx]\n (assoc ctx :used-features features))\n (mm/lift tc/select-columns (conj features :survived))\n {:metamorph/id :model} (ml/model {:model-type model-type})))\n\ncreate a 5-K cross validation split of the data\n\n(def titanic-k-fold (tc/split->seq ml-basic/numeric-titanic-data :kfold {:seed 12345}))\n\nThe list of the model types we want to try:\n\n(def model-types [:metamorph.ml/dummy-classifier\n :smile.classification/random-forest\n :smile.classification/logistic-regression\n :smile.classification/decision-tree\n :smile.classification/ada-boost])\n\nThis uses models from smile only, but could be any metamorph.ml compliant model ( library sklearn-clj wraps all python sklearn models, for example)\nThe list of feature combinations to try for each model:\n\n(def feature-combinations\n [[:sex :pclass :embarked]\n [:sex]\n [:pclass :embarked]\n [:embarked]\n [:sex :embarked]\n [:sex :pclass]])\n\ngenerate 30 pipeline functions:\n\n(def pipe-fns\n (for [model-type model-types\n feature-combination feature-combinations]\n (make-pipe-fn model-type feature-combination)))\n\nExceute all pipelines for all splits in the cross-validations and return best model by classification-accuracy\n\n(def evaluation-results\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy))\n\nBy default it returns the best mode only\n\n(make-results-ds evaluation-results)\n\n\n_unnamed [1 
3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :smile.classification/ada-boost}\n\n\n\n\nThe key observation is here, that the metamorph pipelines allow to not only grid-search over the model hyper-parameters, but as well over arbitrary pipeline variations, like which features to include. Both get handled in the same way.\nWwe can get all results as well:\n\n(def evaluation-results-all\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy\n {:return-best-crossvalidation-only false\n :return-best-pipeline-only false}))\n\nIn total it creates and evaluates 5 models * 6 model configurations * 5 CV = 150 models\n\n(-> evaluation-results-all flatten count)\n\n\n150\n\nWe can find the best as well by hand, it’s the first from the list, when sorted by accuracy.\n\n(-> (make-results-ds evaluation-results-all)\n (tc/unique-by)\n (tc/order-by [:mean-accuracy] :desc))\n\n\n_unnamed [30 3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :smile.classification/ada-boost}\n\n\n[:sex :pclass]\n0.78633911\n{:model-type :smile.classification/random-forest}\n\n\n[:sex :embarked]\n0.78633276\n{:model-type :smile.classification/ada-boost}\n\n\n[:sex]\n0.78633276\n{:model-type :smile.classification/ada-boost}\n\n\n[:sex :pclass]\n0.78633276\n{:model-type :smile.classification/logistic-regression}\n\n\n[:sex :embarked]\n0.78633276\n{:model-type :smile.classification/logistic-regression}\n\n\n[:sex]\n0.78633276\n{:model-type :smile.classification/logistic-regression}\n\n\n[:sex]\n0.78633276\n{:model-type :smile.classification/random-forest}\n\n\n[:sex :pclass :embarked]\n0.78298737\n{:model-type :smile.classification/random-forest}\n\n\n[:sex :pclass :embarked]\n0.77507776\n{:model-type :smile.classification/logistic-regression}\n\n\n…\n…\n…\n\n\n[:pclass :embarked]\n0.63783406\n{:model-type 
:smile.classification/random-forest}\n\n\n[:embarked]\n0.63780232\n{:model-type :smile.classification/logistic-regression}\n\n\n[:embarked]\n0.63218435\n{:model-type :smile.classification/ada-boost}\n\n\n[:embarked]\n0.61419412\n{:model-type :smile.classification/random-forest}\n\n\n[:embarked]\n0.61305783\n{:model-type :smile.classification/decision-tree}\n\n\n[:sex :pclass]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:sex :embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:pclass :embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:sex]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:sex :pclass :embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}",
+ "text": "4.4 Finding the best model automatically\nThe advantage of the pipelines is even more visible, if we want to have configurable pipelines, and do a grid search to find optimal settings.\nthe following will find the best model across: * 5 different model classes * 6 different selections of used features * k-cross validate this with different test / train splits\n\n(defn make-pipe-fn [model-type features]\n (mm/pipeline\n ;; store the used features in ctx, so we can retrieve them at the end\n (fn [ctx]\n (assoc ctx :used-features features))\n (mm/lift tc/select-columns (conj features :survived))\n {:metamorph/id :model} (ml/model {:model-type model-type})))\n\ncreate a 5-K cross validation split of the data\n\n(def titanic-k-fold (tc/split->seq ml-basic/numeric-titanic-data :kfold {:seed 12345}))\n\nThe list of the model types we want to try:\n\n(def model-types [:metamorph.ml/dummy-classifier\n :smile.classification/random-forest\n :smile.classification/logistic-regression\n :smile.classification/decision-tree\n :smile.classification/ada-boost])\n\nThis uses models from smile only, but could be any metamorph.ml compliant model ( library sklearn-clj wraps all python sklearn models, for example)\nThe list of feature combinations to try for each model:\n\n(def feature-combinations\n [[:sex :pclass :embarked]\n [:sex]\n [:pclass :embarked]\n [:embarked]\n [:sex :embarked]\n [:sex :pclass]])\n\ngenerate 30 pipeline functions:\n\n(def pipe-fns\n (for [model-type model-types\n feature-combination feature-combinations]\n (make-pipe-fn model-type feature-combination)))\n\nExceute all pipelines for all splits in the cross-validations and return best model by classification-accuracy\n\n(def evaluation-results\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy))\n\nBy default it returns the best mode only\n\n(make-results-ds evaluation-results)\n\n\n_unnamed [1 
3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :smile.classification/ada-boost}\n\n\n\n\nThe key observation is here, that the metamorph pipelines allow to not only grid-search over the model hyper-parameters, but as well over arbitrary pipeline variations, like which features to include. Both get handled in the same way.\nWwe can get all results as well:\n\n(def evaluation-results-all\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy\n {:return-best-crossvalidation-only false\n :return-best-pipeline-only false}))\n\nIn total it creates and evaluates 5 models * 6 model configurations * 5 CV = 150 models\n\n(-> evaluation-results-all flatten count)\n\n\n150\n\nWe can find the best as well by hand, it’s the first from the list, when sorted by accuracy.\n\n(-> (make-results-ds evaluation-results-all)\n (tc/unique-by)\n (tc/order-by [:mean-accuracy] :desc))\n\n\n_unnamed [30 3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :smile.classification/ada-boost}\n\n\n[:sex]\n0.78633276\n{:model-type :smile.classification/ada-boost}\n\n\n[:sex :pclass]\n0.78633276\n{:model-type :smile.classification/logistic-regression}\n\n\n[:sex :embarked]\n0.78633276\n{:model-type :smile.classification/logistic-regression}\n\n\n[:sex]\n0.78633276\n{:model-type :smile.classification/logistic-regression}\n\n\n[:sex]\n0.78633276\n{:model-type :smile.classification/random-forest}\n\n\n[:sex :pclass]\n0.78521551\n{:model-type :smile.classification/random-forest}\n\n\n[:sex :pclass :embarked]\n0.77849299\n{:model-type :smile.classification/random-forest}\n\n\n[:sex :pclass]\n0.77734400\n{:model-type :smile.classification/ada-boost}\n\n\n[:sex :pclass :embarked]\n0.77507776\n{:model-type :smile.classification/logistic-regression}\n\n\n…\n…\n…\n\n\n[:embarked]\n0.63780232\n{:model-type 
:smile.classification/logistic-regression}\n\n\n[:pclass :embarked]\n0.63670412\n{:model-type :smile.classification/random-forest}\n\n\n[:embarked]\n0.63218435\n{:model-type :smile.classification/ada-boost}\n\n\n[:embarked]\n0.61419412\n{:model-type :smile.classification/random-forest}\n\n\n[:embarked]\n0.61305783\n{:model-type :smile.classification/decision-tree}\n\n\n[:sex :pclass]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:sex :embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:pclass :embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:sex]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}\n\n\n[:sex :pclass :embarked]\n0.38243509\n{:model-type :metamorph.ml/dummy-classifier}",
"crumbs": [
"4 AutoML using metamorph pipelines"
]
@@ -214,7 +224,7 @@
"href": "noj_book.interactions_ols.html#additive-model",
"title": "5 Ordinary least squares with interactions",
"section": "",
- "text": "(def additive-pipeline\n (mm/pipeline\n {:metamorph/id :model}\n (ml/model {:model-type :smile.regression/ordinary-least-square})))\n\n\n\n(def evaluations\n (ml/evaluate-pipelines\n [additive-pipeline]\n (tc/split->seq preprocessed-data :holdout)\n loss/rmse\n :loss\n {:other-metrices [{:name :r2\n :metric-fn fmstats/r2-determination}]}))\n\n\n\n(-> evaluations flatten first :fit-ctx :model ml/thaw-model)\n\n\nLinear Model:\n\nResiduals:\n Min 1Q Median 3Q Max\n -10.4530 -1.1270 0.4041 1.5434 3.5116\n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|)\nIntercept 3.3370 0.4750 7.0255 0.0000 ***\nyoutube 0.0458 0.0018 25.4854 0.0000 ***\nfacebook 0.1893 0.0105 18.0913 0.0000 ***\n---------------------------------------------------------------------\nSignificance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 2.1938 on 130 degrees of freedom\nMultiple R-squared: 0.8818, Adjusted R-squared: 0.8800\nF-statistic: 485.1281 on 3 and 130 DF, p-value: 5.119e-61\n\n\n\n\n(-> evaluations flatten first :test-transform :metric)\n\n\n1.637953126356843\n\n\n\n(-> evaluations flatten first :test-transform :other-metrices first :metric)\n\n\n0.9322495403155694",
+ "text": "(def additive-pipeline\n (mm/pipeline\n {:metamorph/id :model}\n (ml/model {:model-type :smile.regression/ordinary-least-square})))\n\n\n\n(def evaluations\n (ml/evaluate-pipelines\n [additive-pipeline]\n (tc/split->seq preprocessed-data :holdout)\n loss/rmse\n :loss\n {:other-metrices [{:name :r2\n :metric-fn fmstats/r2-determination}]}))\n\n\n\n(-> evaluations flatten first :fit-ctx :model ml/thaw-model)\n\n\nLinear Model:\n\nResiduals:\n Min 1Q Median 3Q Max\n -7.0837 -0.7686 0.2551 1.3391 3.1244\n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|)\nIntercept 4.0113 0.4016 9.9873 0.0000 ***\nyoutube 0.0432 0.0016 26.2031 0.0000 ***\nfacebook 0.1919 0.0092 20.8554 0.0000 ***\n---------------------------------------------------------------------\nSignificance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 1.9175 on 130 degrees of freedom\nMultiple R-squared: 0.9066, Adjusted R-squared: 0.9051\nF-statistic: 630.7411 on 3 and 130 DF, p-value: 1.203e-67\n\n\n\n\n(-> evaluations flatten first :test-transform :metric)\n\n\n2.265017256746244\n\n\n\n(-> evaluations flatten first :test-transform :other-metrices first :metric)\n\n\n0.8844252104035826",
"crumbs": [
"5 Ordinary least squares with interactions"
]
@@ -224,7 +234,7 @@
"href": "noj_book.interactions_ols.html#interaction-effects",
"title": "5 Ordinary least squares with interactions",
"section": "5.2 Interaction effects",
- "text": "5.2 Interaction effects\nNow we add interaction effects to it, resulting in this model equation: \\[sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)\\]\n\n(def pipe-interaction\n (mm/pipeline\n (tcpipe/add-column :youtube*facebook (fn [ds] (fun/* (ds :youtube) (ds :facebook))))\n {:metamorph/id :model}(ml/model {:model-type :smile.regression/ordinary-least-square})))\n\nAgain we evaluate the model,\n\n(def evaluations\n (ml/evaluate-pipelines\n [pipe-interaction]\n (tc/split->seq preprocessed-data :holdout)\n loss/rmse\n :loss\n {:other-metrices [{:name :r2\n :metric-fn fmstats/r2-determination}]}))\n\nand print it and the performance metrices:\n\n(-> evaluations flatten first :fit-ctx :model ml/thaw-model)\n\n\nLinear Model:\n\nResiduals:\n Min 1Q Median 3Q Max\n -7.3257 -0.4254 0.2263 0.7205 1.6955\n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|)\nIntercept 8.3738 0.3887 21.5428 0.0000 ***\nyoutube 0.0180 0.0019 9.3218 0.0000 ***\nfacebook 0.0172 0.0112 1.5405 0.1259 \nyoutube*facebook 0.0009 0.0001 17.4617 0.0000 ***\n---------------------------------------------------------------------\nSignificance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 1.1550 on 129 degrees of freedom\nMultiple R-squared: 0.9683, Adjusted R-squared: 0.9675\nF-statistic: 1312.8955 on 4 and 129 DF, p-value: 1.919e-96\n\nAs the multiplcation of youtube*facebook is as well statistically relevant, it suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook.\n\\(RMSE\\)\n\n(-> evaluations flatten first :test-transform :metric)\n\n\n1.1052884033088548\n\n\\(R^2\\)\n\n(-> evaluations flatten first :test-transform :other-metrices first :metric)\n\n\n0.9659333629686162\n\n\\(RMSE\\) and \\(R^2\\) of the intercation model are sligtly better.\nThese results suggest that the model with the interaction term is better than the model that contains only main effects. 
So, for this specific data, we should go for the model with the interaction model.\n\nsource: notebooks/noj_book/interactions_ols.clj",
+ "text": "5.2 Interaction effects\nNow we add interaction effects to it, resulting in this model equation: \\[sales = b0 + b1 * youtube + b2 * facebook + b3 * (youtube * facebook)\\]\n\n(def pipe-interaction\n (mm/pipeline\n (tcpipe/add-column :youtube*facebook (fn [ds] (fun/* (ds :youtube) (ds :facebook))))\n {:metamorph/id :model}(ml/model {:model-type :smile.regression/ordinary-least-square})))\n\nAgain we evaluate the model,\n\n(def evaluations\n (ml/evaluate-pipelines\n [pipe-interaction]\n (tc/split->seq preprocessed-data :holdout)\n loss/rmse\n :loss\n {:other-metrices [{:name :r2\n :metric-fn fmstats/r2-determination}]}))\n\nand print it and the performance metrices:\n\n(-> evaluations flatten first :fit-ctx :model ml/thaw-model)\n\n\nLinear Model:\n\nResiduals:\n Min 1Q Median 3Q Max\n -7.2707 -0.4485 0.2310 0.7650 1.7510\n\nCoefficients:\n Estimate Std. Error t value Pr(>|t|)\nIntercept 8.1095 0.3905 20.7660 0.0000 ***\nyoutube 0.0194 0.0020 9.6959 0.0000 ***\nfacebook 0.0216 0.0114 1.8930 0.0606 .\nyoutube*facebook 0.0009 0.0001 15.9876 0.0000 ***\n---------------------------------------------------------------------\nSignificance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\nResidual standard error: 1.2222 on 129 degrees of freedom\nMultiple R-squared: 0.9615, Adjusted R-squared: 0.9607\nF-statistic: 1075.2548 on 4 and 129 DF, p-value: 4.777e-91\n\nAs the multiplcation of youtube*facebook is as well statistically relevant, it suggests that there is indeed an interaction between these 2 predictor variables youtube and facebook.\n\\(RMSE\\)\n\n(-> evaluations flatten first :test-transform :metric)\n\n\n0.9588610422624989\n\n\\(R^2\\)\n\n(-> evaluations flatten first :test-transform :other-metrices first :metric)\n\n\n0.977610159246305\n\n\\(RMSE\\) and \\(R^2\\) of the intercation model are sligtly better.\nThese results suggest that the model with the interaction term is better than the model that contains only main effects. 
So, for this specific data, we should go for the model with the interaction model.\n\nsource: notebooks/noj_book/interactions_ols.clj",
"crumbs": [
"5 Ordinary least squares with interactions"
]
@@ -294,7 +304,7 @@
"href": "noj_book.statistics.html#multivariate-regression",
"title": "8 Statistics (experimental 🛠)",
"section": "8.3 Multivariate regression",
- "text": "8.3 Multivariate regression\nThe stats/regression-model function computes a regressiom model (using scicloj.ml) and adds some relevant information such as the R^2 measure.\n\n(-> iris\n (stats/regression-model\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/elastic-net})\n (dissoc :model-data))\n\n\n{:feature-columns [:sepal-width :petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8582120394596505,\n :id #uuid \"09969a91-5e57-4ee3-aea0-d3d9bf24a215\",\n :predictions #tech.v3.dataset.column<float64>[150]\n:sepal-length\n[5.022, 4.724, 4.775, 4.851, 5.081, 5.360, 4.911, 5.030, 4.664, 4.903, 5.209, 5.098, 4.775, 4.572, 5.184, 5.522, 5.089, 4.970, 5.352, 5.217...],\n :predict\n #function[scicloj.noj.v1.stats/regression-model/predict--63683],\n :options {:model-type :smile.regression/elastic-net}}\n\n\n(-> iris\n (stats/regression-model\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/ordinary-least-square})\n (dissoc :model-data))\n\n\n{:feature-columns [:sepal-width :petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8586117200663171,\n :id #uuid \"18236b22-7811-472d-97cb-72146a8e1c5a\",\n :predictions #tech.v3.dataset.column<float64>[150]\n:sepal-length\n[5.015, 4.690, 4.749, 4.826, 5.080, 5.377, 4.895, 5.021, 4.625, 4.882, 5.216, 5.092, 4.746, 4.533, 5.199, 5.561, 5.094, 4.960, 5.368, 5.226...],\n :predict\n #function[scicloj.noj.v1.stats/regression-model/predict--63683],\n :options {:model-type :smile.regression/ordinary-least-square}}\n\nThe stats/linear-regression-model convenience function uses specifically the :smile.regression/ordinary-least-square model type.\n\n(-> iris\n (stats/linear-regression-model\n :sepal-length\n [:sepal-width :petal-length :petal-width])\n (dissoc :model-data))\n\n\n{:feature-columns [:sepal-width 
:petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8586117200663171,\n :id #uuid \"a0a398ca-862a-4568-892c-4558ceb9bafb\",\n :predictions #tech.v3.dataset.column<float64>[150]\n:sepal-length\n[5.015, 4.690, 4.749, 4.826, 5.080, 5.377, 4.895, 5.021, 4.625, 4.882, 5.216, 5.092, 4.746, 4.533, 5.199, 5.561, 5.094, 4.960, 5.368, 5.226...],\n :predict\n #function[scicloj.noj.v1.stats/regression-model/predict--63683],\n :options {:model-type :smile.regression/ordinary-least-square}}",
+ "text": "8.3 Multivariate regression\nThe stats/regression-model function computes a regressiom model (using scicloj.ml) and adds some relevant information such as the R^2 measure.\n\n(-> iris\n (stats/regression-model\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/elastic-net})\n (dissoc :model-data))\n\n\n{:feature-columns [:sepal-width :petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8582120394596505,\n :id #uuid \"4d455060-963a-4792-a46d-8680d97643c8\",\n :predictions #tech.v3.dataset.column<float64>[150]\n:sepal-length\n[5.022, 4.724, 4.775, 4.851, 5.081, 5.360, 4.911, 5.030, 4.664, 4.903, 5.209, 5.098, 4.775, 4.572, 5.184, 5.522, 5.089, 4.970, 5.352, 5.217...],\n :predict\n #function[scicloj.noj.v1.stats/regression-model/predict--60541],\n :options {:model-type :smile.regression/elastic-net}}\n\n\n(-> iris\n (stats/regression-model\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/ordinary-least-square})\n (dissoc :model-data))\n\n\n{:feature-columns [:sepal-width :petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8586117200663171,\n :id #uuid \"7e2014b4-28fc-4147-abeb-73b06bd33c87\",\n :predictions #tech.v3.dataset.column<float64>[150]\n:sepal-length\n[5.015, 4.690, 4.749, 4.826, 5.080, 5.377, 4.895, 5.021, 4.625, 4.882, 5.216, 5.092, 4.746, 4.533, 5.199, 5.561, 5.094, 4.960, 5.368, 5.226...],\n :predict\n #function[scicloj.noj.v1.stats/regression-model/predict--60541],\n :options {:model-type :smile.regression/ordinary-least-square}}\n\nThe stats/linear-regression-model convenience function uses specifically the :smile.regression/ordinary-least-square model type.\n\n(-> iris\n (stats/linear-regression-model\n :sepal-length\n [:sepal-width :petal-length :petal-width])\n (dissoc :model-data))\n\n\n{:feature-columns [:sepal-width 
:petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8586117200663171,\n :id #uuid \"75e25455-6a0e-4174-9ac6-24ba37c19d88\",\n :predictions #tech.v3.dataset.column<float64>[150]\n:sepal-length\n[5.015, 4.690, 4.749, 4.826, 5.080, 5.377, 4.895, 5.021, 4.625, 4.882, 5.216, 5.092, 4.746, 4.533, 5.199, 5.561, 5.094, 4.960, 5.368, 5.226...],\n :predict\n #function[scicloj.noj.v1.stats/regression-model/predict--60541],\n :options {:model-type :smile.regression/ordinary-least-square}}",
"crumbs": [
"8 Statistics (experimental 🛠)"
]
@@ -304,7 +314,7 @@
"href": "noj_book.statistics.html#adding-regression-predictions-to-a-dataset",
"title": "8 Statistics (experimental 🛠)",
"section": "8.4 Adding regression predictions to a dataset",
- "text": "8.4 Adding regression predictions to a dataset\nThe stats/add-predictions function models a target column using feature columns, adds a new prediction column with the model predictions.\n\n(-> iris\n (stats/add-predictions\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/ordinary-least-square}))\n\n\nhttps://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv [150 7]:\n\n\n\n\n\n\n\n\n\n\n\n\n:rownames\n:sepal-length\n:sepal-width\n:petal-length\n:petal-width\n:species\n:sepal-length-prediction\n\n\n\n\n1\n5.1\n3.5\n1.4\n0.2\nsetosa\n5.01541576\n\n\n2\n4.9\n3.0\n1.4\n0.2\nsetosa\n4.68999718\n\n\n3\n4.7\n3.2\n1.3\n0.2\nsetosa\n4.74925142\n\n\n4\n4.6\n3.1\n1.5\n0.2\nsetosa\n4.82599409\n\n\n5\n5.0\n3.6\n1.4\n0.2\nsetosa\n5.08049948\n\n\n6\n5.4\n3.9\n1.7\n0.4\nsetosa\n5.37719368\n\n\n7\n4.6\n3.4\n1.4\n0.3\nsetosa\n4.89468378\n\n\n8\n5.0\n3.4\n1.5\n0.2\nsetosa\n5.02124524\n\n\n9\n4.4\n2.9\n1.4\n0.2\nsetosa\n4.62491347\n\n\n10\n4.9\n3.1\n1.5\n0.1\nsetosa\n4.88164236\n\n\n…\n…\n…\n…\n…\n…\n…\n\n\n140\n6.9\n3.1\n5.4\n2.1\nvirginica\n6.53429168\n\n\n141\n6.7\n3.1\n5.6\n2.4\nvirginica\n6.50917327\n\n\n142\n6.9\n3.1\n5.1\n2.3\nvirginica\n6.21025556\n\n\n143\n5.8\n2.7\n5.1\n1.9\nvirginica\n6.17251376\n\n\n144\n6.8\n3.2\n5.9\n2.3\nvirginica\n6.84264484\n\n\n145\n6.7\n3.3\n5.7\n2.5\nvirginica\n6.65460564\n\n\n146\n6.7\n3.0\n5.2\n2.3\nvirginica\n6.21608504\n\n\n147\n6.3\n2.5\n5.0\n1.9\nvirginica\n5.97143313\n\n\n148\n6.5\n3.0\n5.2\n2.0\nvirginica\n6.38302984\n\n\n149\n6.2\n3.4\n5.4\n2.3\nvirginica\n6.61824630\n\n\n150\n5.9\n3.0\n5.1\n1.8\nvirginica\n6.42341317\n\n\n\n\nIt attaches the model’s information to the metadata of that new column.\n\n(-> iris\n (stats/add-predictions\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/ordinary-least-square})\n :sepal-length-prediction\n meta\n (update :model\n dissoc :model-data :predict :predictions))\n\n\n{:name 
:sepal-length-prediction,\n :datatype :float64,\n :n-elems 150,\n :column-type :prediction,\n :model\n {:feature-columns [:sepal-width :petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8586117200663171,\n :id #uuid \"ea4f1683-8c46-44b3-bec2-02f19d9fb7c7\",\n :options {:model-type :smile.regression/ordinary-least-square}}}",
+ "text": "8.4 Adding regression predictions to a dataset\nThe stats/add-predictions function models a target column using feature columns, adds a new prediction column with the model predictions.\n\n(-> iris\n (stats/add-predictions\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/ordinary-least-square}))\n\n\nhttps://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv [150 7]:\n\n\n\n\n\n\n\n\n\n\n\n\n:rownames\n:sepal-length\n:sepal-width\n:petal-length\n:petal-width\n:species\n:sepal-length-prediction\n\n\n\n\n1\n5.1\n3.5\n1.4\n0.2\nsetosa\n5.01541576\n\n\n2\n4.9\n3.0\n1.4\n0.2\nsetosa\n4.68999718\n\n\n3\n4.7\n3.2\n1.3\n0.2\nsetosa\n4.74925142\n\n\n4\n4.6\n3.1\n1.5\n0.2\nsetosa\n4.82599409\n\n\n5\n5.0\n3.6\n1.4\n0.2\nsetosa\n5.08049948\n\n\n6\n5.4\n3.9\n1.7\n0.4\nsetosa\n5.37719368\n\n\n7\n4.6\n3.4\n1.4\n0.3\nsetosa\n4.89468378\n\n\n8\n5.0\n3.4\n1.5\n0.2\nsetosa\n5.02124524\n\n\n9\n4.4\n2.9\n1.4\n0.2\nsetosa\n4.62491347\n\n\n10\n4.9\n3.1\n1.5\n0.1\nsetosa\n4.88164236\n\n\n…\n…\n…\n…\n…\n…\n…\n\n\n140\n6.9\n3.1\n5.4\n2.1\nvirginica\n6.53429168\n\n\n141\n6.7\n3.1\n5.6\n2.4\nvirginica\n6.50917327\n\n\n142\n6.9\n3.1\n5.1\n2.3\nvirginica\n6.21025556\n\n\n143\n5.8\n2.7\n5.1\n1.9\nvirginica\n6.17251376\n\n\n144\n6.8\n3.2\n5.9\n2.3\nvirginica\n6.84264484\n\n\n145\n6.7\n3.3\n5.7\n2.5\nvirginica\n6.65460564\n\n\n146\n6.7\n3.0\n5.2\n2.3\nvirginica\n6.21608504\n\n\n147\n6.3\n2.5\n5.0\n1.9\nvirginica\n5.97143313\n\n\n148\n6.5\n3.0\n5.2\n2.0\nvirginica\n6.38302984\n\n\n149\n6.2\n3.4\n5.4\n2.3\nvirginica\n6.61824630\n\n\n150\n5.9\n3.0\n5.1\n1.8\nvirginica\n6.42341317\n\n\n\n\nIt attaches the model’s information to the metadata of that new column.\n\n(-> iris\n (stats/add-predictions\n :sepal-length\n [:sepal-width :petal-length :petal-width]\n {:model-type :smile.regression/ordinary-least-square})\n :sepal-length-prediction\n meta\n (update :model\n dissoc :model-data :predict :predictions))\n\n\n{:name 
:sepal-length-prediction,\n :datatype :float64,\n :n-elems 150,\n :column-type :prediction,\n :model\n {:feature-columns [:sepal-width :petal-length :petal-width],\n :target-columns [:sepal-length],\n :explained #function[clojure.lang.AFunction/1],\n :R2 0.8586117200663171,\n :id #uuid \"7650009a-4dc7-4aa4-9886-738257925fb4\",\n :options {:model-type :smile.regression/ordinary-least-square}}}",
"crumbs": [
"8 Statistics (experimental 🛠)"
]
@@ -314,7 +324,7 @@
"href": "noj_book.statistics.html#histograms",
"title": "8 Statistics (experimental 🛠)",
"section": "8.5 Histograms",
- "text": "8.5 Histograms\nThe stats/histogram function computes the necessary data to plot a histogram.\n\n(-> (repeatedly 99 rand)\n (stats/histogram {:bin-count 5}))\n\n\n_unnamed [5 3]:\n\n\n\n:count\n:left\n:right\n\n\n\n\n28\n0.00802265\n0.20465354\n\n\n18\n0.20465354\n0.40128444\n\n\n17\n0.40128444\n0.59791533\n\n\n19\n0.59791533\n0.79454622\n\n\n17\n0.79454622\n0.99117712\n\n\n\n\n\nsource: notebooks/noj_book/statistics.clj",
+ "text": "8.5 Histograms\nThe stats/histogram function computes the necessary data to plot a histogram.\n\n(-> (repeatedly 99 rand)\n (stats/histogram {:bin-count 5}))\n\n\n_unnamed [5 3]:\n\n\n\n:count\n:left\n:right\n\n\n\n\n20\n0.00050999\n0.19394683\n\n\n14\n0.19394683\n0.38738366\n\n\n19\n0.38738366\n0.58082049\n\n\n21\n0.58082049\n0.77425733\n\n\n25\n0.77425733\n0.96769416\n\n\n\n\n\nsource: notebooks/noj_book/statistics.clj",
"crumbs": [
"8 Statistics (experimental 🛠)"
]
@@ -334,7 +344,7 @@
"href": "noj_book.visualization.html#visualizing-datases-with-hanami",
"title": "9 Visualization (experimental 🛠)",
"section": "",
- "text": "(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.1.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis.hanami/plot ht/point-chart\n {:MSIZE 200}))\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis.hanami/plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :usermeta {:embedOptions {:renderer :svg}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,-0.004207939587986709\\n1,-0.055789133080412334\\n2,-0.2768589633756995\\n3,-0.22201979781606618\\n4,0.24555568601114297\\n5,0.44839725879571757\\n6,0.359213993628024\\n7,0.7215768669433458\\n8,0.358897975524867\\n9,-0.12917456247823011\\n10,-0.26944984279344497\\n11,-0.6118165371009159\\n12,-1.111534335097509\\n13,-1.010186411979333\\n14,-1.3058657472829194\\n15,-1.5044117735506326\\n16,-1.6613913902571045\\n17,-1.278309667469086\\n18,-1.5370082224387598\\n19,-1.3880203897513188\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.1.2 More examples\n\n(-> datasets/mtcars\n (vis.hanami/plot ht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n(-> datasets/iris\n (vis.hanami/plot ht/rule-chart\n {:X :sepal-width\n :Y :sepal-length\n :X2 :petal-width\n :Y2 :petal-length\n :OPACITY 0.2\n :SIZE 3\n :COLOR \"species\"}))\n\n\n\n\n9.1.3 Grouped datasets\nGrouped datasets are handled automatically with a table view.\n\n(-> datasets/iris\n (tc/group-by [:species])\n (vis.hanami/plot ht/rule-chart\n {:X :sepal-width\n :Y :sepal-length\n :X2 :petal-width\n :Y2 :petal-length\n :OPACITY 0.2\n :SIZE 
3}))\n\n\n\n\n\n\n\n\n\n\n\nspecies\nplot\n\n\n\n\nsetosa\n\n\n\n\n\nversicolor\n\n\n\n\n\nvirginica\n\n\n\n\n\n\n\n\n\n\n\n9.1.4 Layers\n\n(-> random-walk\n (vis.hanami/layers\n {:TITLE \"points and a line\"}\n [(vis.hanami/plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis.hanami/plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\nAlternatively:\n\n(-> random-walk\n (vis.hanami/combined-plot\n ht/layer-chart\n {:TITLE \"points and a line\"}\n :LAYER [[ht/point-chart\n {:MSIZE 400}]\n [ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"}]]))\n\n\n\n\n9.1.5 Concatenation\nVertical\n\n(-> random-walk\n (vis.hanami/vconcat\n {}\n [(vis.hanami/plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis.hanami/plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\nAlternatively:\n\n(-> random-walk\n (vis.hanami/combined-plot\n ht/vconcat-chart\n {:HEIGHT 100\n :WIDTH 100}\n :VCONCAT [[ht/point-chart\n {:MSIZE 400}]\n [ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"}]]))\n\n\nHorizontal\n\n(-> random-walk\n (vis.hanami/hconcat\n {}\n [(vis.hanami/plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis.hanami/plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\nAlternatively:\n\n(-> random-walk\n (vis.hanami/combined-plot\n ht/hconcat-chart\n {:HEIGHT 100\n :WIDTH 100}\n :HCONCAT [[ht/point-chart\n {:MSIZE 400}]\n [ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"}]]))\n\n\n\n:bye\n\n\n:bye\n\n\nsource: notebooks/noj_book/visualization.clj",
+ "text": "(def random-walk\n (let [n 20]\n (-> {:x (range n)\n :y (->> (repeatedly n #(- (rand) 0.5))\n (reductions +))}\n tc/dataset)))\n\n\n9.1.1 A simple plot\nWe can plot a Tablecloth datasete using a Hanami template:\n\n(-> random-walk\n (vis.hanami/plot ht/point-chart\n {:MSIZE 200}))\n\n\nLet us look inside the resulting vega-lite space. We can see the dataset is included as CSV:\n\n(-> random-walk\n (vis.hanami/plot ht/point-chart\n {:MSIZE 200})\n kind/pprint)\n\n\n{:encoding\n {:y {:field \"y\", :type \"quantitative\"},\n :x {:field \"x\", :type \"quantitative\"}},\n :usermeta {:embedOptions {:renderer :svg}},\n :mark {:type \"circle\", :size 200, :tooltip true},\n :width 400,\n :background \"floralwhite\",\n :height 300,\n :data\n {:values\n \"x,y\\n0,-0.1325648359179058\\n1,-0.27990686413394583\\n2,-0.11718484436161825\\n3,-0.2462438952131002\\n4,-0.7000742338173375\\n5,-0.8118999614681428\\n6,-0.34004919446904114\\n7,-0.45753462191489835\\n8,-0.09561111731129412\\n9,0.3586051405571238\\n10,0.6155968621550317\\n11,0.8444353375346749\\n12,0.5098834495225566\\n13,0.653697725279412\\n14,0.5704985315016169\\n15,0.51532920679822\\n16,0.02004226065388337\\n17,-0.11893807448086313\\n18,0.18608707238769473\\n19,0.3880074626880815\\n\",\n :format {:type \"csv\"}}}\n\n\n\n9.1.2 More examples\n\n(-> datasets/mtcars\n (vis.hanami/plot ht/boxplot-chart\n {:X :gear\n :XTYPE :nominal\n :Y :mpg}))\n\n\n\n(-> datasets/iris\n (vis.hanami/plot ht/rule-chart\n {:X :sepal-width\n :Y :sepal-length\n :X2 :petal-width\n :Y2 :petal-length\n :OPACITY 0.2\n :SIZE 3\n :COLOR \"species\"}))\n\n\n\n\n9.1.3 Grouped datasets\nGrouped datasets are handled automatically with a table view.\n\n(-> datasets/iris\n (tc/group-by [:species])\n (vis.hanami/plot ht/rule-chart\n {:X :sepal-width\n :Y :sepal-length\n :X2 :petal-width\n :Y2 :petal-length\n :OPACITY 0.2\n :SIZE 
3}))\n\n\n\n\n\n\n\n\n\n\n\nspecies\nplot\n\n\n\n\nsetosa\n\n\n\n\n\nversicolor\n\n\n\n\n\nvirginica\n\n\n\n\n\n\n\n\n\n\n\n9.1.4 Layers\n\n(-> random-walk\n (vis.hanami/layers\n {:TITLE \"points and a line\"}\n [(vis.hanami/plot nil\n ht/point-chart\n {:MSIZE 400})\n (vis.hanami/plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"})]))\n\n\nAlternatively:\n\n(-> random-walk\n (vis.hanami/combined-plot\n ht/layer-chart\n {:TITLE \"points and a line\"}\n :LAYER [[ht/point-chart\n {:MSIZE 400}]\n [ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"}]]))\n\n\n\n\n9.1.5 Concatenation\nVertical\n\n(-> random-walk\n (vis.hanami/vconcat\n {}\n [(vis.hanami/plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis.hanami/plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\nAlternatively:\n\n(-> random-walk\n (vis.hanami/combined-plot\n ht/vconcat-chart\n {:HEIGHT 100\n :WIDTH 100}\n :VCONCAT [[ht/point-chart\n {:MSIZE 400}]\n [ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"}]]))\n\n\nHorizontal\n\n(-> random-walk\n (vis.hanami/hconcat\n {}\n [(vis.hanami/plot nil\n ht/point-chart\n {:MSIZE 400\n :HEIGHT 100\n :WIDTH 100})\n (vis.hanami/plot nil\n ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"\n :HEIGHT 100\n :WIDTH 100})]))\n\n\nAlternatively:\n\n(-> random-walk\n (vis.hanami/combined-plot\n ht/hconcat-chart\n {:HEIGHT 100\n :WIDTH 100}\n :HCONCAT [[ht/point-chart\n {:MSIZE 400}]\n [ht/line-chart\n {:MSIZE 4\n :MCOLOR \"brown\"}]]))\n\n\n\n:bye\n\n\n:bye\n\n\nsource: notebooks/noj_book/visualization.clj",
"crumbs": [
"9 Visualization (experimental 🛠)"
]