From 58e578d89dd0eec962f97f3552e11b4daab9e0be Mon Sep 17 00:00:00 2001 From: daslu Date: Sun, 29 Sep 2024 10:22:23 +0000 Subject: [PATCH 1/3] Apply automatic changes --- docs/index.html | 6 +- docs/noj_book.automl.html | 172 +++++++++--------- ...book.visualizing_correlation_matrices.html | 2 +- docs/search.json | 16 +- 4 files changed, 98 insertions(+), 98 deletions(-) diff --git a/docs/index.html b/docs/index.html index 0a50041..819a4f9 100644 --- a/docs/index.html +++ b/docs/index.html @@ -203,7 +203,7 @@

Table of contents

@@ -259,8 +259,8 @@

1 Preface

  • Keep documenting core ideas of the underlying libraries and ways to combine them in typical workflows.

  • Keep making the docs generate automatic tests using kindly/check.

  • -
    -

    1.1 Existing chapters in this book:

    +
    +

    1.1 Chapters of this book:

    • Overview

        diff --git a/docs/noj_book.automl.html b/docs/noj_book.automl.html index 0afc294..53e82ec 100644 --- a/docs/noj_book.automl.html +++ b/docs/noj_book.automl.html @@ -328,7 +328,7 @@

        my-pipeline
        -
        #object[clojure.core$partial$fn__5927 0x3a2c5042 "clojure.core$partial$fn__5927@3a2c5042"]
        +
        #object[clojure.core$partial$fn__5927 0x8e0742a "clojure.core$partial$fn__5927@8e0742a"]

        This function is metamorph compliant, so it takes a map (my-pipeline {}) and returns a map.

        But this map cannot be “arbitrary”, it needs to adhere to the metamorph conventions.

        @@ -367,13 +367,13 @@

        1.0 -3.0 -0.0 -0.0 +1.0 +2.0 +1.0 -0.0 -3.0 +1.0 +1.0 0.0 1.0 @@ -381,49 +381,49 @@

        0.0 2.0 0.0 -0.0 +1.0 0.0 -1.0 +3.0 +0.0 0.0 -1.0 -0.0 1.0 -0.0 -0.0 +1.0 +2.0 +1.0 1.0 -1.0 2.0 +0.0 1.0 0.0 -3.0 -1.0 +2.0 +0.0 0.0 -0.0 1.0 +3.0 +0.0 0.0 -1.0 +0.0 1.0 -3.0 0.0 0.0 0.0 -3.0 -2.0 +1.0 0.0 +1.0 ... @@ -432,8 +432,8 @@

        ... -0.0 1.0 +2.0 0.0 1.0 @@ -444,10 +444,10 @@

        0.0 -1.0 +0.0 3.0 -1.0 -1.0 +2.0 +0.0 0.0 @@ -463,15 +463,15 @@

        0.0 -1.0 +3.0 0.0 0.0 1.0 +3.0 2.0 -2.0 -1.0 +0.0 0.0 @@ -481,20 +481,20 @@

        0.0 -3.0 +1.0 0.0 0.0 -0.0 -3.0 -0.0 -0.0 +1.0 +1.0 +2.0 +1.0 0.0 -3.0 -0.0 +1.0 +2.0 0.0 @@ -504,7 +504,7 @@

        :metamorph/mode :fit
        #uuid "a16e8242-7231-4726-9f42-d2294d725240" {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "fd86059a-0e4e-4eed-80e2-232fe5cb1f6e", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {"no" 0, "yes" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}

        }

        +
        :metamorph/mode :fit
        #uuid "fd1ac3f3-7f5c-4e69-86b1-4655f1310d53" {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "d19b7a20-e19e-4d60-a5e8-4f363c909c38", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {"no" 0, "yes" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}

        }

        The ctx contains lots of information, so I only show its top level keys

        (keys ctx-after-train)
        @@ -512,7 +512,7 @@

        (:metamorph/data
          :metamorph/mode
        - #uuid "a16e8242-7231-4726-9f42-d2294d725240")
        + #uuid "fd1ac3f3-7f5c-4e69-86b1-4655f1310d53")

        This context map has the “data”, the “mode” and a UUID for each operation (we had only one in this pipeline).

        @@ -531,13 +531,13 @@

        1.0 -3.0 -0.0 -0.0 +1.0 +2.0 +1.0 -0.0 -3.0 +1.0 +1.0 0.0 1.0 @@ -545,49 +545,49 @@

        0.0 2.0 0.0 -0.0 +1.0 0.0 -1.0 +3.0 +0.0 0.0 -1.0 -0.0 1.0 -0.0 -0.0 +1.0 +2.0 +1.0 1.0 -1.0 2.0 +0.0 1.0 0.0 -3.0 -1.0 +2.0 +0.0 0.0 -0.0 1.0 +3.0 +0.0 0.0 -1.0 +0.0 1.0 -3.0 0.0 0.0 0.0 -3.0 -2.0 +1.0 0.0 +1.0 ... @@ -596,8 +596,8 @@

        ... -0.0 1.0 +2.0 0.0 1.0 @@ -608,10 +608,10 @@

        0.0 -1.0 +0.0 3.0 -1.0 -1.0 +2.0 +0.0 0.0 @@ -627,15 +627,15 @@

        0.0 -1.0 +3.0 0.0 0.0 1.0 +3.0 2.0 -2.0 -1.0 +0.0 0.0 @@ -645,28 +645,28 @@

        0.0 -3.0 +1.0 0.0 0.0 -0.0 -3.0 -0.0 -0.0 +1.0 +1.0 +2.0 +1.0 0.0 -3.0 -0.0 +1.0 +2.0 0.0

        :fit
        -
        {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)},
        +
        {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)},
          :options {:model-type :metamorph.ml/dummy-classifier},
        - :id #uuid "fd86059a-0e4e-4eed-80e2-232fe5cb1f6e",
        + :id #uuid "d19b7a20-e19e-4d60-a5e8-4f363c909c38",
          :feature-columns [:sex :pclass :embarked],
          :target-columns [:survived],
          :target-categorical-maps
        @@ -690,7 +690,7 @@ 

        (:metamorph/data
          :metamorph/mode
        - #uuid "a16e8242-7231-4726-9f42-d2294d725240")
        + #uuid "fd1ac3f3-7f5c-4e69-86b1-4655f1310d53")

        For the dummy-model we do not see a trained-model, but it “communicates” the majority class from the train data to use it for prediction. So the dummy-model has ‘learned’ the majority class from its training data.

        So we can get prediction result out of the ctx:

        @@ -700,7 +700,7 @@

        #tech.v3.dataset.column<float64>[178]
         :survived
        -[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]
        +[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]

        This works as long as all operations of the pipeline follow the metamorph convention (we can create such compliant functions, out of normal dataset->dataset functions, as we will see)

        my-pipeline represents therefore a not yet executed model training / prediction flow. It can be freely moved around and applied to datasets when needed.

        @@ -723,7 +723,7 @@

        (:metamorph/data
          :metamorph/mode
        - #uuid "56ea7ff8-969d-404f-b03a-a340f8547587")
        + #uuid "5d293179-218e-41d6-b725-8d06e1cc4219")

        To show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.

        We can already chain train and test with usual functions:

        @@ -736,7 +736,7 @@

        #tech.v3.dataset.column<float64>[178]
         :survived
        -[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]
        +[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]

        the same with pipelines

        @@ -752,7 +752,7 @@

        #tech.v3.dataset.column<float64>[178]
         :survived
        -[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]
        +[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]

    @@ -788,19 +788,19 @@

    (mm/pipeline ops-1)
    -
    #object[clojure.core$partial$fn__5927 0x7bdb1918 "clojure.core$partial$fn__5927@7bdb1918"]
    +
    #object[clojure.core$partial$fn__5927 0x1244c8b0 "clojure.core$partial$fn__5927@1244c8b0"]
    (mm/pipeline ops-2)
    -
    #object[clojure.core$partial$fn__5927 0x29bfe521 "clojure.core$partial$fn__5927@29bfe521"]
    +
    #object[clojure.core$partial$fn__5927 0x7697e6e5 "clojure.core$partial$fn__5927@7697e6e5"]
    (mm/pipeline ops-3)
    -
    #object[clojure.core$partial$fn__5927 0x10f13284 "clojure.core$partial$fn__5927@10f13284"]
    +
    #object[clojure.core$partial$fn__5927 0x79747166 "clojure.core$partial$fn__5927@79747166"]

    All three can be called as functions taking a dataset wrapped in a ctx

    Pipeline as data is as well supported

    @@ -811,7 +811,7 @@

    (mm/->pipeline op-spec)
    -
    #object[clojure.core$partial$fn__5927 0x473b9ee3 "clojure.core$partial$fn__5927@473b9ee3"]
    +
    #object[clojure.core$partial$fn__5927 0x5d95154e "clojure.core$partial$fn__5927@5d95154e"]

    Creating these functions does not yet execute anything, they are functions which can be executed against a context as part of a metamorph pipeline. Executions are triggered like this:

    @@ -1663,20 +1663,20 @@

    -
    [:sex :pclass :embarked]
    +
    [:sex :pclass]
    -0.7750777629657843 +0.775096806957405
    -
    {:model-type :sklearn.classification/logistic-regression}
    +
    {:model-type :sklearn.classification/random-forest-classifier}
    -
    [:sex :pclass]
    +
    [:sex :pclass :embarked]
    -0.773973211451787 +0.7750777629657843
    -
    {:model-type :sklearn.classification/random-forest-classifier}
    +
    {:model-type :sklearn.classification/logistic-regression}
    diff --git a/docs/noj_book.visualizing_correlation_matrices.html b/docs/noj_book.visualizing_correlation_matrices.html index 339cf9e..d4b925f 100644 --- a/docs/noj_book.visualizing_correlation_matrices.html +++ b/docs/noj_book.visualizing_correlation_matrices.html @@ -546,7 +546,7 @@

    Note the slider control and the tooltips.

    Here is an example with an actual correlation matrix.

    diff --git a/docs/search.json b/docs/search.json index 51b201d..cbdd0f6 100644 --- a/docs/search.json +++ b/docs/search.json @@ -10,11 +10,11 @@ ] }, { - "objectID": "index.html#existing-chapters-in-this-book", - "href": "index.html#existing-chapters-in-this-book", + "objectID": "index.html#chapters-of-this-book", + "href": "index.html#chapters-of-this-book", "title": "Scinojure Documentation", - "section": "1.1 Existing chapters in this book:", - "text": "1.1 Existing chapters in this book:\n\nOverview\n\nUnderlying libraries\nRecommended libraries\nKnown issues ❗\n\nTutorials\n\nDatasets\nMachine learning\nMachine learning specific functionality in tech.ml.dataset\nAutoML using metamorph pipelines\nOrdinary least squares with interactions\nVisualizing correlation matrices (experimental 🛠) - DRAFT\n\n\n\nsource: notebooks/index.clj", + "section": "1.1 Chapters of this book:", + "text": "1.1 Chapters of this book:\n\nOverview\n\nUnderlying libraries\nRecommended libraries\nKnown issues ❗\n\nTutorials\n\nDatasets\nMachine learning\nMachine learning specific functionality in tech.ml.dataset\nAutoML using metamorph pipelines\nOrdinary least squares with interactions\nVisualizing correlation matrices (experimental 🛠) - DRAFT\n\n\n\nsource: notebooks/index.clj", "crumbs": [ "1  Preface" ] @@ -200,7 +200,7 @@ "href": "noj_book.automl.html#the-metamorph-pipeline-abstraction", "title": "8  AutoML using metamorph pipelines", "section": "", - "text": "(require '[scicloj.metamorph.ml :as ml]\n '[scicloj.metamorph.core :as mm]\n '[tablecloth.api :as tc])\n\n\n\n(def titanic ml-basic/numeric-titanic-data)\n\n\n\n(def splits (first (tc/split->seq titanic)))\n\n\n(def train-ds (:train splits))\n\n\n(def test-ds (:test splits))\n\n\n\n\n(def my-pipeline\n (mm/pipeline\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n\nmy-pipeline\n\n\n#object[clojure.core$partial$fn__5927 0x3a2c5042 \"clojure.core$partial$fn__5927@3a2c5042\"]\n\n\n\n\n\n(def ctx-after-train\n 
(my-pipeline {:metamorph/data train-ds\n :metamorph/mode :fit}))\n\n\nctx-after-train\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\nGroup: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n...\n...\n...\n...\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n2.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"a16e8242-7231-4726-9f42-d2294d725240\" {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid \"fd86059a-0e4e-4eed-80e2-232fe5cb1f6e\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {\"no\" 0, \"yes\" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? 
nil}}\n\n\n(keys ctx-after-train)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"a16e8242-7231-4726-9f42-d2294d725240\")\n\n\n\n(vals ctx-after-train)\n\n(Group: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n...\n...\n...\n...\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n2.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n\n:fit\n{:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)},\n :options {:model-type :metamorph.ml/dummy-classifier},\n :id #uuid \"fd86059a-0e4e-4eed-80e2-232fe5cb1f6e\",\n :feature-columns [:sex :pclass :embarked],\n :target-columns [:survived],\n :target-categorical-maps\n {:survived\n {:lookup-table {\"no\" 0, \"yes\" 1},\n :src-column :survived,\n :result-datatype :float64}},\n :scicloj.metamorph.ml/unsupervised? 
nil}\n)\n\n\n\n(def ctx-after-predict\n (my-pipeline (assoc ctx-after-train\n :metamorph/mode :transform\n :metamorph/data test-ds)))\n\n\n(keys ctx-after-predict)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"a16e8242-7231-4726-9f42-d2294d725240\")\n\n\n\n\n(-> ctx-after-predict :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]", + "text": "(require '[scicloj.metamorph.ml :as ml]\n '[scicloj.metamorph.core :as mm]\n '[tablecloth.api :as tc])\n\n\n\n(def titanic ml-basic/numeric-titanic-data)\n\n\n\n(def splits (first (tc/split->seq titanic)))\n\n\n(def train-ds (:train splits))\n\n\n(def test-ds (:test splits))\n\n\n\n\n(def my-pipeline\n (mm/pipeline\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n\nmy-pipeline\n\n\n#object[clojure.core$partial$fn__5927 0x8e0742a \"clojure.core$partial$fn__5927@8e0742a\"]\n\n\n\n\n\n(def ctx-after-train\n (my-pipeline {:metamorph/data train-ds\n :metamorph/mode :fit}))\n\n\nctx-after-train\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\nGroup: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n...\n...\n...\n...\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"fd1ac3f3-7f5c-4e69-86b1-4655f1310d53\" {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid 
\"d19b7a20-e19e-4d60-a5e8-4f363c909c38\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {\"no\" 0, \"yes\" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}}\n\n\n(keys ctx-after-train)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fd1ac3f3-7f5c-4e69-86b1-4655f1310d53\")\n\n\n\n(vals ctx-after-train)\n\n(Group: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n...\n...\n...\n...\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n\n:fit\n{:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)},\n :options {:model-type :metamorph.ml/dummy-classifier},\n :id #uuid \"d19b7a20-e19e-4d60-a5e8-4f363c909c38\",\n :feature-columns [:sex :pclass :embarked],\n :target-columns [:survived],\n :target-categorical-maps\n {:survived\n {:lookup-table {\"no\" 0, \"yes\" 1},\n :src-column :survived,\n :result-datatype :float64}},\n :scicloj.metamorph.ml/unsupervised? 
nil}\n)\n\n\n\n(def ctx-after-predict\n (my-pipeline (assoc ctx-after-train\n :metamorph/mode :transform\n :metamorph/data test-ds)))\n\n\n(keys ctx-after-predict)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fd1ac3f3-7f5c-4e69-86b1-4655f1310d53\")\n\n\n\n\n(-> ctx-after-predict :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" @@ -211,7 +211,7 @@ "href": "noj_book.automl.html#use-metamorph-pipelines-to-do-model-training-with-higher-level-api", "title": "8  AutoML using metamorph pipelines", "section": "8.2 Use metamorph pipelines to do model training with higher level API", - "text": "8.2 Use metamorph pipelines to do model training with higher level API\nAs user of metamorph.ml we do not need to deal with this low-level details of how metamorph works, we have convenience functions which hide this.\nThe following code will do the same as train, but return a context object, which contains the trained model, so it will execute the pipeline, and not only create it.\nIt uses a convenience function mm/fit which generates compliant context maps internally and executes the pipeline as well.\nThe ctx acts a collector of everything “learned” during :fit, mainly the trained model, but it could be as well other information learned from the data during :fit and to be applied at :transform .\n\n(def train-ctx\n (mm/fit titanic\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n(The dummy-classifier model does not have a lot of state, so there is little to see)\n\n(keys train-ctx)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"56ea7ff8-969d-404f-b03a-a340f8547587\")\n\nTo show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.\nWe can already chain train and test with usual 
functions:\n\n(->>\n (ml/train train-ds {:model-type :metamorph.ml/dummy-classifier})\n (ml/predict test-ds)\n :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]\n\nthe same with pipelines\n\n(def pipeline\n (mm/pipeline (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n(->>\n (mm/fit-pipe train-ds pipeline)\n (mm/transform-pipe test-ds pipeline)\n :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]", + "text": "8.2 Use metamorph pipelines to do model training with higher level API\nAs user of metamorph.ml we do not need to deal with this low-level details of how metamorph works, we have convenience functions which hide this.\nThe following code will do the same as train, but return a context object, which contains the trained model, so it will execute the pipeline, and not only create it.\nIt uses a convenience function mm/fit which generates compliant context maps internally and executes the pipeline as well.\nThe ctx acts a collector of everything “learned” during :fit, mainly the trained model, but it could be as well other information learned from the data during :fit and to be applied at :transform .\n\n(def train-ctx\n (mm/fit titanic\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n(The dummy-classifier model does not have a lot of state, so there is little to see)\n\n(keys train-ctx)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"5d293179-218e-41d6-b725-8d06e1cc4219\")\n\nTo show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.\nWe can already chain train and test with usual functions:\n\n(->>\n (ml/train train-ds {:model-type 
:metamorph.ml/dummy-classifier})\n (ml/predict test-ds)\n :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]\n\nthe same with pipelines\n\n(def pipeline\n (mm/pipeline (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n(->>\n (mm/fit-pipe train-ds pipeline)\n (mm/transform-pipe test-ds pipeline)\n :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" @@ -222,7 +222,7 @@ "href": "noj_book.automl.html#create-metamorph-compliant-functions", "title": "8  AutoML using metamorph pipelines", "section": "8.3 Create metamorph compliant functions", - "text": "8.3 Create metamorph compliant functions\nAs said before, a metamorph pipeline is composed of metamorph compliant functions / operations, which take as input and output the ctx. 
There are three ways to create those.\nThe following three expressions create the same metamorph compliant function\n\nimplementing a metamorph compliant function directly via anonymous function\n\n\n(def ops-1\n (fn [ctx]\n (assoc ctx :metamorph/data\n (tc/drop-columns (:metamorph/data ctx) [:embarked]))))\n\n\nusing mm/lift which does the same as 1.\n\n\n(def ops-2 (mm/lift tc/drop-columns [:embarked]))\n\n\nusing a name-space containing lifted functions\n\n\n(require '[tablecloth.pipeline])\n\n\n(def ops-3 (tablecloth.pipeline/drop-columns [:embarked]))\n\nAll three create the same pipeline op and can be used to make a pipeline\n\n(mm/pipeline ops-1)\n\n\n#object[clojure.core$partial$fn__5927 0x7bdb1918 \"clojure.core$partial$fn__5927@7bdb1918\"]\n\n\n(mm/pipeline ops-2)\n\n\n#object[clojure.core$partial$fn__5927 0x29bfe521 \"clojure.core$partial$fn__5927@29bfe521\"]\n\n\n(mm/pipeline ops-3)\n\n\n#object[clojure.core$partial$fn__5927 0x10f13284 \"clojure.core$partial$fn__5927@10f13284\"]\n\nAll three can be called as function taking a dataset iwrapped in a ctx\nPipeline as data is as well supported\n\n(def op-spec [[ml/model {:model-type :metamorph.ml/dummy-classifier}]])\n\n\n(mm/->pipeline op-spec)\n\n\n#object[clojure.core$partial$fn__5927 0x473b9ee3 \"clojure.core$partial$fn__5927@473b9ee3\"]\n\nCreating these functions does not yet execute anything, they are functions which can be executed against a context as part of a metamorph pipeline. 
Executions are triggered like this:\n\n(ops-1 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-2 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-3 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\nThe mm/lift function transforms any dataset->dataset function into a ctx->ctx function, while using the metamorph convention, as required for metamorph pipeline operations\nFor convenience tablecloth contains a ns where all dataset->dataset 
functions are lifted into ctx->ctx operations, so can be added to pipelines directly without using lift.\nSo a metamorph pipeline can encapsulate arbitrary transformation of a dataset in the 2 modes. They can be “stateless” (only chaining the dataset, such as drop-columns) or “state-full”, so they store data in the ctx during :fit and can use it in :transform. In the pipeline above, the trained model is stored in this way.\nThis state is not stored globally, but inside the pipeline so this makes pipeline execution “isolated”.\nSo now we can add more operations to the pipeline, and nothing else changes, for example drop columns.", + "text": "8.3 Create metamorph compliant functions\nAs said before, a metamorph pipeline is composed of metamorph compliant functions / operations, which take as input and output the ctx. There are three ways to create those.\nThe following three expressions create the same metamorph compliant function\n\nimplementing a metamorph compliant function directly via anonymous function\n\n\n(def ops-1\n (fn [ctx]\n (assoc ctx :metamorph/data\n (tc/drop-columns (:metamorph/data ctx) [:embarked]))))\n\n\nusing mm/lift which does the same as 1.\n\n\n(def ops-2 (mm/lift tc/drop-columns [:embarked]))\n\n\nusing a name-space containing lifted functions\n\n\n(require '[tablecloth.pipeline])\n\n\n(def ops-3 (tablecloth.pipeline/drop-columns [:embarked]))\n\nAll three create the same pipeline op and can be used to make a pipeline\n\n(mm/pipeline ops-1)\n\n\n#object[clojure.core$partial$fn__5927 0x1244c8b0 \"clojure.core$partial$fn__5927@1244c8b0\"]\n\n\n(mm/pipeline ops-2)\n\n\n#object[clojure.core$partial$fn__5927 0x7697e6e5 \"clojure.core$partial$fn__5927@7697e6e5\"]\n\n\n(mm/pipeline ops-3)\n\n\n#object[clojure.core$partial$fn__5927 0x79747166 \"clojure.core$partial$fn__5927@79747166\"]\n\nAll three can be called as function taking a dataset iwrapped in a ctx\nPipeline as data is as well supported\n\n(def op-spec [[ml/model {:model-type 
:metamorph.ml/dummy-classifier}]])\n\n\n(mm/->pipeline op-spec)\n\n\n#object[clojure.core$partial$fn__5927 0x5d95154e \"clojure.core$partial$fn__5927@5d95154e\"]\n\nCreating these functions does not yet execute anything, they are functions which can be executed against a context as part of a metamorph pipeline. Executions are triggered like this:\n\n(ops-1 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-2 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-3 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 
3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\nThe mm/lift function transforms any dataset->dataset function into a ctx->ctx function, while using the metamorph convention, as required for metamorph pipeline operations\nFor convenience tablecloth contains a ns where all dataset->dataset functions are lifted into ctx->ctx operations, so can be added to pipelines directly without using lift.\nSo a metamorph pipeline can encapsulate arbitrary transformation of a dataset in the 2 modes. They can be “stateless” (only chaining the dataset, such as drop-columns) or “state-full”, so they store data in the ctx during :fit and can use it in :transform. 
In the pipeline above, the trained model is stored in this way.\nThis state is not stored globally, but inside the pipeline so this makes pipeline execution “isolated”.\nSo now we can add more operations to the pipeline, and nothing else changes, for example drop columns.", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" @@ -244,7 +244,7 @@ "href": "noj_book.automl.html#finding-the-best-model-automatically", "title": "8  AutoML using metamorph pipelines", "section": "8.5 Finding the best model automatically", - "text": "8.5 Finding the best model automatically\nThe advantage of the pipelines is even more visible, if we want to have configurable pipelines, and do a grid search to find optimal settings.\nthe following will find the best model across:\n\n4 different model classes\n6 different selections of used features\nk-cross validate this with different test / train splits\n\n\n(defn make-pipe-fn [model-spec features]\n (mm/pipeline\n ;; store the used features in ctx, so we can retrieve them at the end\n (fn [ctx]\n (assoc ctx :used-features features))\n (mm/lift tc/select-columns (conj features :survived))\n {:metamorph/id :model} (ml/model model-spec)))\n\nCreate a 5-K cross validation split of the data:\n\n(def titanic-k-fold (tc/split->seq ml-basic/numeric-titanic-data :kfold {:seed 12345}))\n\n\n(-> titanic-k-fold count)\n\n\n5\n\nThe list of the model types we want to try:\n\n(def models [{ :model-type :xgboost/classification\n :round 10}\n {:model-type :sklearn.classification/decision-tree-classifier}\n {:model-type :sklearn.classification/logistic-regression}\n {:model-type :sklearn.classification/random-forest-classifier}\n {:model-type :metamorph.ml/dummy-classifier}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name \"logistic\"\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}]\n :tribuo-trainer-name \"logistic\"}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name 
\"random-forest\"\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\"\n :properties {:maxDepth \"8\"\n :useRandomSplitPoints \"false\"\n :fractionFeaturesInSplit \"0.5\"}}]\n :tribuo-trainer-name \"random-forest\"}])\n\nThis uses models from Smile and Tribuo, but could be any metamorph.ml compliant model ( library sklearn-clj wraps all python sklearn models, for example)\nThe list of feature combinations to try for each model:\n\n(def feature-combinations\n [[:sex :pclass :embarked]\n [:sex]\n [:pclass :embarked]\n [:embarked]\n [:sex :embarked]\n [:sex :pclass]])\n\ngenerate 24 pipeline functions:\n\n(def pipe-fns\n (for [model models\n feature-combination feature-combinations]\n (make-pipe-fn model feature-combination)))\n\n\n(count pipe-fns)\n\n\n42\n\nExecute all pipelines for all splits in the cross-validations and return best model by classification-accuracy\n\n(def evaluation-results\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy))\n\nBy default it returns the best mode only\n\n(make-results-ds evaluation-results)\n\n\n_unnamed [1 3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :scicloj.ml.tribuo/classification,\n\n\n\n\n:tribuo-components\n\n\n\n\n[{:name random-forest,\n\n\n\n\n:type org.tribuo.classification.dtree.CARTClassificationTrainer,\n\n\n\n\n:properties\n\n\n\n\n{:maxDepth 8,\n\n\n\n\n:useRandomSplitPoints false,\n\n\n\n\n:fractionFeaturesInSplit 0.5}}],\n\n\n\n\n:tribuo-trainer-name random-forest}\n\n\n\n\nThe key observation is here, that the metamorph pipelines allow to not only grid-search over the model hyper-parameters, but as well over arbitrary pipeline variations, like which features to include. 
Both get handled in the same way.\nWe can get all results as well:\n\n(def evaluation-results-all\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy\n {:map-fn :map\n :return-best-crossvalidation-only false\n :return-best-pipeline-only false}))\n\nIn total it creates and evaluates 4 models * 6 feature configurations * 5 CV = 120 models\n\n(-> evaluation-results-all flatten count)\n\n\n210\n\nWe can find the best as well by hand, it’s the first from the list, when sorted by accuracy.\n\n(-> (make-results-ds evaluation-results-all)\n (tc/unique-by)\n (tc/order-by [:mean-accuracy] :desc)\n (tc/head 20)\n (kind/table))\n\n\n\n\n\n\n\n\n\n\n\nused-features\nmean-accuracy\noptions\n\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name 
\"logistic\"}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7852091665079668\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7750777629657843\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :pclass]\n\n0.773973211451787\n\n{:model-type 
:sklearn.classification/random-forest-classifier}", + "text": "8.5 Finding the best model automatically\nThe advantage of the pipelines is even more visible, if we want to have configurable pipelines, and do a grid search to find optimal settings.\nthe following will find the best model across:\n\n4 different model classes\n6 different selections of used features\nk-cross validate this with different test / train splits\n\n\n(defn make-pipe-fn [model-spec features]\n (mm/pipeline\n ;; store the used features in ctx, so we can retrieve them at the end\n (fn [ctx]\n (assoc ctx :used-features features))\n (mm/lift tc/select-columns (conj features :survived))\n {:metamorph/id :model} (ml/model model-spec)))\n\nCreate a 5-K cross validation split of the data:\n\n(def titanic-k-fold (tc/split->seq ml-basic/numeric-titanic-data :kfold {:seed 12345}))\n\n\n(-> titanic-k-fold count)\n\n\n5\n\nThe list of the model types we want to try:\n\n(def models [{ :model-type :xgboost/classification\n :round 10}\n {:model-type :sklearn.classification/decision-tree-classifier}\n {:model-type :sklearn.classification/logistic-regression}\n {:model-type :sklearn.classification/random-forest-classifier}\n {:model-type :metamorph.ml/dummy-classifier}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name \"logistic\"\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}]\n :tribuo-trainer-name \"logistic\"}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name \"random-forest\"\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\"\n :properties {:maxDepth \"8\"\n :useRandomSplitPoints \"false\"\n :fractionFeaturesInSplit \"0.5\"}}]\n :tribuo-trainer-name \"random-forest\"}])\n\nThis uses models from Smile and Tribuo, but could be any metamorph.ml compliant model ( library sklearn-clj wraps all python sklearn models, for example)\nThe list of feature combinations to try for each model:\n\n(def feature-combinations\n 
[[:sex :pclass :embarked]\n [:sex]\n [:pclass :embarked]\n [:embarked]\n [:sex :embarked]\n [:sex :pclass]])\n\ngenerate 24 pipeline functions:\n\n(def pipe-fns\n (for [model models\n feature-combination feature-combinations]\n (make-pipe-fn model feature-combination)))\n\n\n(count pipe-fns)\n\n\n42\n\nExecute all pipelines for all splits in the cross-validations and return best model by classification-accuracy\n\n(def evaluation-results\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy))\n\nBy default it returns the best mode only\n\n(make-results-ds evaluation-results)\n\n\n_unnamed [1 3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :scicloj.ml.tribuo/classification,\n\n\n\n\n:tribuo-components\n\n\n\n\n[{:name random-forest,\n\n\n\n\n:type org.tribuo.classification.dtree.CARTClassificationTrainer,\n\n\n\n\n:properties\n\n\n\n\n{:maxDepth 8,\n\n\n\n\n:useRandomSplitPoints false,\n\n\n\n\n:fractionFeaturesInSplit 0.5}}],\n\n\n\n\n:tribuo-trainer-name random-forest}\n\n\n\n\nThe key observation is here, that the metamorph pipelines allow to not only grid-search over the model hyper-parameters, but as well over arbitrary pipeline variations, like which features to include. 
Both get handled in the same way.\nWe can get all results as well:\n\n(def evaluation-results-all\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy\n {:map-fn :map\n :return-best-crossvalidation-only false\n :return-best-pipeline-only false}))\n\nIn total it creates and evaluates 4 models * 6 feature configurations * 5 CV = 120 models\n\n(-> evaluation-results-all flatten count)\n\n\n210\n\nWe can find the best as well by hand, it’s the first from the list, when sorted by accuracy.\n\n(-> (make-results-ds evaluation-results-all)\n (tc/unique-by)\n (tc/order-by [:mean-accuracy] :desc)\n (tc/head 20)\n (kind/table))\n\n\n\n\n\n\n\n\n\n\n\nused-features\nmean-accuracy\noptions\n\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name 
\"logistic\"}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7852091665079668\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :pclass]\n\n0.775096806957405\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7750777629657843\n\n{:model-type 
:sklearn.classification/logistic-regression}", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" From 8cc207728ec00cfd2d118a1f3efce82e901b10f5 Mon Sep 17 00:00:00 2001 From: Carsten Behring Date: Sun, 29 Sep 2024 16:19:09 +0200 Subject: [PATCH 2/3] Aktualisieren von automl.clj fixed author name --- notebooks/noj_book/automl.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/noj_book/automl.clj b/notebooks/noj_book/automl.clj index a809441..a49c918 100644 --- a/notebooks/noj_book/automl.clj +++ b/notebooks/noj_book/automl.clj @@ -1,6 +1,6 @@ ;; # AutoML using metamorph pipelines -;; author: Carsten Behring +;; Author: Carsten Behring ;; In this tutorial we see how to use `metamorph.ml` to perform automatic machine learning. ;; With AutoML we mean to try lots of different models and hyper parameters and rely on automatic From 4b672511b77bd62726e5fd1c95439de2e53f01ed Mon Sep 17 00:00:00 2001 From: behrica Date: Sun, 29 Sep 2024 14:24:54 +0000 Subject: [PATCH 3/3] Apply automatic changes --- docs/noj_book.automl.html | 186 +++++++++--------- ...book.visualizing_correlation_matrices.html | 2 +- docs/search.json | 8 +- 3 files changed, 98 insertions(+), 98 deletions(-) diff --git a/docs/noj_book.automl.html b/docs/noj_book.automl.html index 53e82ec..1322efd 100644 --- a/docs/noj_book.automl.html +++ b/docs/noj_book.automl.html @@ -279,7 +279,7 @@

    8  -

    author: Carsten Behring

    +

    Author: Carsten Behring

    In this tutorial we see how to use metamorph.ml to perform automatic machine learning. By AutoML we mean trying lots of different models and hyperparameters and relying on automatic validation to pick the best-performing model.

    (ns noj-book.automl
    @@ -328,7 +328,7 @@ 

    my-pipeline

    -
    #object[clojure.core$partial$fn__5927 0x8e0742a "clojure.core$partial$fn__5927@8e0742a"]
    +
    #object[clojure.core$partial$fn__5927 0x7597bcc7 "clojure.core$partial$fn__5927@7597bcc7"]

    This function is metamorph compliant, so it takes a map (my-pipeline {}) and returns a map.

    But this map cannot be “arbitrary”; it needs to adhere to the metamorph conventions.

    @@ -366,22 +366,22 @@

    -1.0 -1.0 -2.0 -1.0 +0.0 +3.0 +0.0 +0.0 1.0 -1.0 +3.0 +2.0 0.0 -1.0 0.0 2.0 0.0 -1.0 +0.0 0.0 @@ -391,36 +391,36 @@

    1.0 -1.0 2.0 +0.0 1.0 -1.0 -2.0 0.0 -1.0 +3.0 +0.0 +0.0 0.0 -2.0 +3.0 0.0 0.0 -1.0 -3.0 0.0 +3.0 +2.0 0.0 0.0 -1.0 +3.0 0.0 0.0 -0.0 +1.0 1.0 0.0 1.0 @@ -432,10 +432,10 @@

    ... -1.0 -2.0 0.0 -1.0 +3.0 +0.0 +0.0 0.0 @@ -446,56 +446,56 @@

    0.0 3.0 -2.0 +1.0 0.0 -0.0 1.0 -2.0 +3.0 +0.0 0.0 -0.0 +1.0 3.0 -0.0 -0.0 +1.0 +1.0 0.0 3.0 0.0 -0.0 +1.0 1.0 -3.0 +1.0 2.0 -0.0 +1.0 0.0 3.0 -0.0 +1.0 0.0 -0.0 1.0 -0.0 -0.0 +1.0 +2.0 +1.0 1.0 1.0 -2.0 +0.0 1.0 -0.0 1.0 -2.0 -0.0 +3.0 +1.0 +1.0 @@ -504,7 +504,7 @@

    :metamorph/mode :fit
    #uuid "fd1ac3f3-7f5c-4e69-86b1-4655f1310d53" {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "d19b7a20-e19e-4d60-a5e8-4f363c909c38", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {"no" 0, "yes" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}

    }

    +
    :metamorph/mode :fit
    #uuid "fc7de623-27ba-49b7-b044-8c5b6c5a0271" {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid "636ad0f4-4c49-4d42-a887-f947b9a288c5", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {"no" 0, "yes" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}

    }

    The ctx contains lots of information, so I only show its top level keys

    (keys ctx-after-train)
    @@ -512,7 +512,7 @@

    (:metamorph/data
      :metamorph/mode
    - #uuid "fd1ac3f3-7f5c-4e69-86b1-4655f1310d53")
    + #uuid "fc7de623-27ba-49b7-b044-8c5b6c5a0271")

    This context map has the “data”, the “mode” and a UUID for each operation (we had only one in this pipeline)

    @@ -530,22 +530,22 @@

    -1.0 -1.0 -2.0 -1.0 +0.0 +3.0 +0.0 +0.0 1.0 -1.0 +3.0 +2.0 0.0 -1.0 0.0 2.0 0.0 -1.0 +0.0 0.0 @@ -555,36 +555,36 @@

    1.0 -1.0 2.0 +0.0 1.0 -1.0 -2.0 0.0 -1.0 +3.0 +0.0 +0.0 0.0 -2.0 +3.0 0.0 0.0 -1.0 -3.0 0.0 +3.0 +2.0 0.0 0.0 -1.0 +3.0 0.0 0.0 -0.0 +1.0 1.0 0.0 1.0 @@ -596,10 +596,10 @@

    ... -1.0 -2.0 0.0 -1.0 +3.0 +0.0 +0.0 0.0 @@ -610,63 +610,63 @@

    0.0 3.0 -2.0 +1.0 0.0 -0.0 1.0 -2.0 +3.0 +0.0 0.0 -0.0 +1.0 3.0 -0.0 -0.0 +1.0 +1.0 0.0 3.0 0.0 -0.0 +1.0 1.0 -3.0 +1.0 2.0 -0.0 +1.0 0.0 3.0 -0.0 +1.0 0.0 -0.0 1.0 -0.0 -0.0 +1.0 +2.0 +1.0 1.0 1.0 -2.0 +0.0 1.0 -0.0 1.0 -2.0 -0.0 +3.0 +1.0 +1.0

    :fit
    -
    {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)},
    +
    {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)},
      :options {:model-type :metamorph.ml/dummy-classifier},
    - :id #uuid "d19b7a20-e19e-4d60-a5e8-4f363c909c38",
    + :id #uuid "636ad0f4-4c49-4d42-a887-f947b9a288c5",
      :feature-columns [:sex :pclass :embarked],
      :target-columns [:survived],
      :target-categorical-maps
    @@ -690,7 +690,7 @@ 

    (:metamorph/data
      :metamorph/mode
    - #uuid "fd1ac3f3-7f5c-4e69-86b1-4655f1310d53")
    + #uuid "fc7de623-27ba-49b7-b044-8c5b6c5a0271")

    For the dummy-model we do not see a trained-model, but it “communicates” the majority class from the train data to use it for prediction. So the dummy-model has ‘learned’ the majority class from its training data.

    So we can get prediction result out of the ctx:

    @@ -700,7 +700,7 @@

    #tech.v3.dataset.column<float64>[178]
     :survived
    -[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]
    +[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]

    This works as long as all operations of the pipeline follow the metamorph convention (we can create such compliant functions, out of normal dataset->dataset functions, as we will see)

    my-pipeline therefore represents a not-yet-executed model training / prediction flow. It can be freely moved around and applied to datasets when needed.

    @@ -723,7 +723,7 @@

    (:metamorph/data
      :metamorph/mode
    - #uuid "5d293179-218e-41d6-b725-8d06e1cc4219")
    + #uuid "03fc4a16-6977-4018-adb6-6173f6cf13fd")

    To show the power of pipelines, I start with the simplest possible pipeline and then expand on it.

    We can already chain train and test with usual functions:

    @@ -736,7 +736,7 @@

    #tech.v3.dataset.column<float64>[178]
     :survived
    -[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]
    +[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]

    The same with pipelines:

    @@ -752,7 +752,7 @@

    #tech.v3.dataset.column<float64>[178]
     :survived
    -[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]
    +[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]

    @@ -788,19 +788,19 @@

    (mm/pipeline ops-1)
    -
    #object[clojure.core$partial$fn__5927 0x1244c8b0 "clojure.core$partial$fn__5927@1244c8b0"]
    +
    #object[clojure.core$partial$fn__5927 0x4ab1e66e "clojure.core$partial$fn__5927@4ab1e66e"]
    (mm/pipeline ops-2)
    -
    #object[clojure.core$partial$fn__5927 0x7697e6e5 "clojure.core$partial$fn__5927@7697e6e5"]
    +
    #object[clojure.core$partial$fn__5927 0x7eee8280 "clojure.core$partial$fn__5927@7eee8280"]
    (mm/pipeline ops-3)
    -
    #object[clojure.core$partial$fn__5927 0x79747166 "clojure.core$partial$fn__5927@79747166"]
    +
    #object[clojure.core$partial$fn__5927 0x7bdb1918 "clojure.core$partial$fn__5927@7bdb1918"]

    All three can be called as functions taking a dataset wrapped in a ctx

    Pipelines as data are supported as well

    @@ -811,7 +811,7 @@

    (mm/->pipeline op-spec)
    -
    #object[clojure.core$partial$fn__5927 0x5d95154e "clojure.core$partial$fn__5927@5d95154e"]
    +
    #object[clojure.core$partial$fn__5927 0x79747166 "clojure.core$partial$fn__5927@79747166"]

    Creating these functions does not yet execute anything, they are functions which can be executed against a context as part of a metamorph pipeline. Executions are triggered like this:

    @@ -1663,20 +1663,20 @@

    -
    [:sex :pclass]
    +
    [:sex :pclass :embarked]
    -0.775096806957405 +0.7750777629657843
    -
    {:model-type :sklearn.classification/random-forest-classifier}
    +
    {:model-type :sklearn.classification/logistic-regression}
    -
    [:sex :pclass :embarked]
    +
    [:sex :pclass]
    -0.7750777629657843 +0.773973211451787
    -
    {:model-type :sklearn.classification/logistic-regression}
    +
    {:model-type :sklearn.classification/random-forest-classifier}
    diff --git a/docs/noj_book.visualizing_correlation_matrices.html b/docs/noj_book.visualizing_correlation_matrices.html index d4b925f..0d7f00a 100644 --- a/docs/noj_book.visualizing_correlation_matrices.html +++ b/docs/noj_book.visualizing_correlation_matrices.html @@ -546,7 +546,7 @@

    Note the slider control and the tooltips.

    Here is an example with an actual correlation matrix.

    diff --git a/docs/search.json b/docs/search.json index cbdd0f6..69f5c45 100644 --- a/docs/search.json +++ b/docs/search.json @@ -200,7 +200,7 @@ "href": "noj_book.automl.html#the-metamorph-pipeline-abstraction", "title": "8  AutoML using metamorph pipelines", "section": "", - "text": "(require '[scicloj.metamorph.ml :as ml]\n '[scicloj.metamorph.core :as mm]\n '[tablecloth.api :as tc])\n\n\n\n(def titanic ml-basic/numeric-titanic-data)\n\n\n\n(def splits (first (tc/split->seq titanic)))\n\n\n(def train-ds (:train splits))\n\n\n(def test-ds (:test splits))\n\n\n\n\n(def my-pipeline\n (mm/pipeline\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n\nmy-pipeline\n\n\n#object[clojure.core$partial$fn__5927 0x8e0742a \"clojure.core$partial$fn__5927@8e0742a\"]\n\n\n\n\n\n(def ctx-after-train\n (my-pipeline {:metamorph/data train-ds\n :metamorph/mode :fit}))\n\n\nctx-after-train\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\nGroup: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n...\n...\n...\n...\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"fd1ac3f3-7f5c-4e69-86b1-4655f1310d53\" {:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid \"d19b7a20-e19e-4d60-a5e8-4f363c909c38\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {\"no\" 0, \"yes\" 1}, :src-column 
:survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}}\n\n\n(keys ctx-after-train)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fd1ac3f3-7f5c-4e69-86b1-4655f1310d53\")\n\n\n\n(vals ctx-after-train)\n\n(Group: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n1.0\n\n\n...\n...\n...\n...\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n1.0\n0.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n1.0\n2.0\n0.0\n\n\n\n:fit\n{:model-data {:majority-class 0.0, :distinct-labels (1.0 0.0)},\n :options {:model-type :metamorph.ml/dummy-classifier},\n :id #uuid \"d19b7a20-e19e-4d60-a5e8-4f363c909c38\",\n :feature-columns [:sex :pclass :embarked],\n :target-columns [:survived],\n :target-categorical-maps\n {:survived\n {:lookup-table {\"no\" 0, \"yes\" 1},\n :src-column :survived,\n :result-datatype :float64}},\n :scicloj.metamorph.ml/unsupervised? 
nil}\n)\n\n\n\n(def ctx-after-predict\n (my-pipeline (assoc ctx-after-train\n :metamorph/mode :transform\n :metamorph/data test-ds)))\n\n\n(keys ctx-after-predict)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fd1ac3f3-7f5c-4e69-86b1-4655f1310d53\")\n\n\n\n\n(-> ctx-after-predict :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]", + "text": "(require '[scicloj.metamorph.ml :as ml]\n '[scicloj.metamorph.core :as mm]\n '[tablecloth.api :as tc])\n\n\n\n(def titanic ml-basic/numeric-titanic-data)\n\n\n\n(def splits (first (tc/split->seq titanic)))\n\n\n(def train-ds (:train splits))\n\n\n(def test-ds (:test splits))\n\n\n\n\n(def my-pipeline\n (mm/pipeline\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n\nmy-pipeline\n\n\n#object[clojure.core$partial$fn__5927 0x7597bcc7 \"clojure.core$partial$fn__5927@7597bcc7\"]\n\n\n\n\n\n(def ctx-after-train\n (my-pipeline {:metamorph/data train-ds\n :metamorph/mode :fit}))\n\n\nctx-after-train\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\nGroup: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n...\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n1.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n\n\n\n\n\n\n:metamorph/mode :fit#uuid \"fc7de623-27ba-49b7-b044-8c5b6c5a0271\" {:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)}, :options {:model-type :metamorph.ml/dummy-classifier}, :id #uuid 
\"636ad0f4-4c49-4d42-a887-f947b9a288c5\", :feature-columns [:sex :pclass :embarked], :target-columns [:survived], :target-categorical-maps {:survived #tech.v3.dataset.categorical.CategoricalMap{:lookup-table {\"no\" 0, \"yes\" 1}, :src-column :survived, :result-datatype :float64}}, :scicloj.metamorph.ml/unsupervised? nil}}\n\n\n(keys ctx-after-train)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fc7de623-27ba-49b7-b044-8c5b6c5a0271\")\n\n\n\n(vals ctx-after-train)\n\n(Group: 0 [711 4]:\n\n\n\n:sex\n:pclass\n:embarked\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n2.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n2.0\n0.0\n1.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n3.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n...\n...\n...\n...\n\n\n0.0\n3.0\n0.0\n0.0\n\n\n0.0\n2.0\n0.0\n0.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n1.0\n3.0\n0.0\n0.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n1.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n1.0\n0.0\n\n\n1.0\n1.0\n2.0\n1.0\n\n\n1.0\n1.0\n0.0\n1.0\n\n\n1.0\n3.0\n1.0\n1.0\n\n\n\n:fit\n{:model-data {:majority-class 1.0, :distinct-labels (0.0 1.0)},\n :options {:model-type :metamorph.ml/dummy-classifier},\n :id #uuid \"636ad0f4-4c49-4d42-a887-f947b9a288c5\",\n :feature-columns [:sex :pclass :embarked],\n :target-columns [:survived],\n :target-categorical-maps\n {:survived\n {:lookup-table {\"no\" 0, \"yes\" 1},\n :src-column :survived,\n :result-datatype :float64}},\n :scicloj.metamorph.ml/unsupervised? 
nil}\n)\n\n\n\n(def ctx-after-predict\n (my-pipeline (assoc ctx-after-train\n :metamorph/mode :transform\n :metamorph/data test-ds)))\n\n\n(keys ctx-after-predict)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"fc7de623-27ba-49b7-b044-8c5b6c5a0271\")\n\n\n\n\n(-> ctx-after-predict :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" @@ -211,7 +211,7 @@ "href": "noj_book.automl.html#use-metamorph-pipelines-to-do-model-training-with-higher-level-api", "title": "8  AutoML using metamorph pipelines", "section": "8.2 Use metamorph pipelines to do model training with higher level API", - "text": "8.2 Use metamorph pipelines to do model training with higher level API\nAs user of metamorph.ml we do not need to deal with this low-level details of how metamorph works, we have convenience functions which hide this.\nThe following code will do the same as train, but return a context object, which contains the trained model, so it will execute the pipeline, and not only create it.\nIt uses a convenience function mm/fit which generates compliant context maps internally and executes the pipeline as well.\nThe ctx acts a collector of everything “learned” during :fit, mainly the trained model, but it could be as well other information learned from the data during :fit and to be applied at :transform .\n\n(def train-ctx\n (mm/fit titanic\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n(The dummy-classifier model does not have a lot of state, so there is little to see)\n\n(keys train-ctx)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"5d293179-218e-41d6-b725-8d06e1cc4219\")\n\nTo show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.\nWe can already chain train and test with usual 
functions:\n\n(->>\n (ml/train train-ds {:model-type :metamorph.ml/dummy-classifier})\n (ml/predict test-ds)\n :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]\n\nthe same with pipelines\n\n(def pipeline\n (mm/pipeline (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n(->>\n (mm/fit-pipe train-ds pipeline)\n (mm/transform-pipe test-ds pipeline)\n :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000...]", + "text": "8.2 Use metamorph pipelines to do model training with higher level API\nAs user of metamorph.ml we do not need to deal with this low-level details of how metamorph works, we have convenience functions which hide this.\nThe following code will do the same as train, but return a context object, which contains the trained model, so it will execute the pipeline, and not only create it.\nIt uses a convenience function mm/fit which generates compliant context maps internally and executes the pipeline as well.\nThe ctx acts a collector of everything “learned” during :fit, mainly the trained model, but it could be as well other information learned from the data during :fit and to be applied at :transform .\n\n(def train-ctx\n (mm/fit titanic\n (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n(The dummy-classifier model does not have a lot of state, so there is little to see)\n\n(keys train-ctx)\n\n\n(:metamorph/data\n :metamorph/mode\n #uuid \"03fc4a16-6977-4018-adb6-6173f6cf13fd\")\n\nTo show the power of pipelines, I start with doing the simplest possible pipeline, and expand then on it.\nWe can already chain train and test with usual functions:\n\n(->>\n (ml/train train-ds {:model-type 
:metamorph.ml/dummy-classifier})\n (ml/predict test-ds)\n :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]\n\nthe same with pipelines\n\n(def pipeline\n (mm/pipeline (ml/model {:model-type :metamorph.ml/dummy-classifier})))\n\n\n(->>\n (mm/fit-pipe train-ds pipeline)\n (mm/transform-pipe test-ds pipeline)\n :metamorph/data :survived)\n\n\n#tech.v3.dataset.column<float64>[178]\n:survived\n[1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000...]", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" @@ -222,7 +222,7 @@ "href": "noj_book.automl.html#create-metamorph-compliant-functions", "title": "8  AutoML using metamorph pipelines", "section": "8.3 Create metamorph compliant functions", - "text": "8.3 Create metamorph compliant functions\nAs said before, a metamorph pipeline is composed of metamorph compliant functions / operations, which take as input and output the ctx. 
There are three ways to create those.\nThe following three expressions create the same metamorph compliant function\n\nimplementing a metamorph compliant function directly via anonymous function\n\n\n(def ops-1\n (fn [ctx]\n (assoc ctx :metamorph/data\n (tc/drop-columns (:metamorph/data ctx) [:embarked]))))\n\n\nusing mm/lift which does the same as 1.\n\n\n(def ops-2 (mm/lift tc/drop-columns [:embarked]))\n\n\nusing a name-space containing lifted functions\n\n\n(require '[tablecloth.pipeline])\n\n\n(def ops-3 (tablecloth.pipeline/drop-columns [:embarked]))\n\nAll three create the same pipeline op and can be used to make a pipeline\n\n(mm/pipeline ops-1)\n\n\n#object[clojure.core$partial$fn__5927 0x1244c8b0 \"clojure.core$partial$fn__5927@1244c8b0\"]\n\n\n(mm/pipeline ops-2)\n\n\n#object[clojure.core$partial$fn__5927 0x7697e6e5 \"clojure.core$partial$fn__5927@7697e6e5\"]\n\n\n(mm/pipeline ops-3)\n\n\n#object[clojure.core$partial$fn__5927 0x79747166 \"clojure.core$partial$fn__5927@79747166\"]\n\nAll three can be called as function taking a dataset iwrapped in a ctx\nPipeline as data is as well supported\n\n(def op-spec [[ml/model {:model-type :metamorph.ml/dummy-classifier}]])\n\n\n(mm/->pipeline op-spec)\n\n\n#object[clojure.core$partial$fn__5927 0x5d95154e \"clojure.core$partial$fn__5927@5d95154e\"]\n\nCreating these functions does not yet execute anything, they are functions which can be executed against a context as part of a metamorph pipeline. 
Executions are triggered like this:\n\n(ops-1 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-2 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-3 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\nThe mm/lift function transforms any dataset->dataset function into a ctx->ctx function, while using the metamorph convention, as required for metamorph pipeline operations\nFor convenience tablecloth contains a ns where all dataset->dataset 
functions are lifted into ctx->ctx operations, so can be added to pipelines directly without using lift.\nSo a metamorph pipeline can encapsulate arbitrary transformation of a dataset in the 2 modes. They can be “stateless” (only chaining the dataset, such as drop-columns) or “state-full”, so they store data in the ctx during :fit and can use it in :transform. In the pipeline above, the trained model is stored in this way.\nThis state is not stored globally, but inside the pipeline so this makes pipeline execution “isolated”.\nSo now we can add more operations to the pipeline, and nothing else changes, for example drop columns.", + "text": "8.3 Create metamorph compliant functions\nAs said before, a metamorph pipeline is composed of metamorph compliant functions / operations, which take as input and output the ctx. There are three ways to create those.\nThe following three expressions create the same metamorph compliant function\n\nimplementing a metamorph compliant function directly via anonymous function\n\n\n(def ops-1\n (fn [ctx]\n (assoc ctx :metamorph/data\n (tc/drop-columns (:metamorph/data ctx) [:embarked]))))\n\n\nusing mm/lift which does the same as 1.\n\n\n(def ops-2 (mm/lift tc/drop-columns [:embarked]))\n\n\nusing a name-space containing lifted functions\n\n\n(require '[tablecloth.pipeline])\n\n\n(def ops-3 (tablecloth.pipeline/drop-columns [:embarked]))\n\nAll three create the same pipeline op and can be used to make a pipeline\n\n(mm/pipeline ops-1)\n\n\n#object[clojure.core$partial$fn__5927 0x4ab1e66e \"clojure.core$partial$fn__5927@4ab1e66e\"]\n\n\n(mm/pipeline ops-2)\n\n\n#object[clojure.core$partial$fn__5927 0x7eee8280 \"clojure.core$partial$fn__5927@7eee8280\"]\n\n\n(mm/pipeline ops-3)\n\n\n#object[clojure.core$partial$fn__5927 0x7bdb1918 \"clojure.core$partial$fn__5927@7bdb1918\"]\n\nAll three can be called as function taking a dataset iwrapped in a ctx\nPipeline as data is as well supported\n\n(def op-spec [[ml/model {:model-type 
:metamorph.ml/dummy-classifier}]])\n\n\n(mm/->pipeline op-spec)\n\n\n#object[clojure.core$partial$fn__5927 0x79747166 \"clojure.core$partial$fn__5927@79747166\"]\n\nCreating these functions does not yet execute anything, they are functions which can be executed against a context as part of a metamorph pipeline. Executions are triggered like this:\n\n(ops-1 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-2 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\n\n(ops-3 {:metamorph/data titanic})\n\n{\n\n\n\n\n\n\n\n\n:metamorph/data\n\n\n\n_unnamed [889 
3]:\n\n\n\n:sex\n:pclass\n:survived\n\n\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n0.0\n1.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n1.0\n\n\n1.0\n2.0\n1.0\n\n\n...\n...\n...\n\n\n1.0\n2.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n0.0\n3.0\n0.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n2.0\n0.0\n\n\n1.0\n1.0\n1.0\n\n\n1.0\n3.0\n0.0\n\n\n0.0\n1.0\n1.0\n\n\n0.0\n3.0\n0.0\n\n\n\n\n\n\n\n\n}\nThe mm/lift function transforms any dataset->dataset function into a ctx->ctx function, while using the metamorph convention, as required for metamorph pipeline operations\nFor convenience tablecloth contains a ns where all dataset->dataset functions are lifted into ctx->ctx operations, so can be added to pipelines directly without using lift.\nSo a metamorph pipeline can encapsulate arbitrary transformation of a dataset in the 2 modes. They can be “stateless” (only chaining the dataset, such as drop-columns) or “state-full”, so they store data in the ctx during :fit and can use it in :transform. 
In the pipeline above, the trained model is stored in this way.\nThis state is not stored globally, but inside the pipeline so this makes pipeline execution “isolated”.\nSo now we can add more operations to the pipeline, and nothing else changes, for example drop columns.", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines" @@ -244,7 +244,7 @@ "href": "noj_book.automl.html#finding-the-best-model-automatically", "title": "8  AutoML using metamorph pipelines", "section": "8.5 Finding the best model automatically", - "text": "8.5 Finding the best model automatically\nThe advantage of the pipelines is even more visible, if we want to have configurable pipelines, and do a grid search to find optimal settings.\nthe following will find the best model across:\n\n4 different model classes\n6 different selections of used features\nk-cross validate this with different test / train splits\n\n\n(defn make-pipe-fn [model-spec features]\n (mm/pipeline\n ;; store the used features in ctx, so we can retrieve them at the end\n (fn [ctx]\n (assoc ctx :used-features features))\n (mm/lift tc/select-columns (conj features :survived))\n {:metamorph/id :model} (ml/model model-spec)))\n\nCreate a 5-K cross validation split of the data:\n\n(def titanic-k-fold (tc/split->seq ml-basic/numeric-titanic-data :kfold {:seed 12345}))\n\n\n(-> titanic-k-fold count)\n\n\n5\n\nThe list of the model types we want to try:\n\n(def models [{ :model-type :xgboost/classification\n :round 10}\n {:model-type :sklearn.classification/decision-tree-classifier}\n {:model-type :sklearn.classification/logistic-regression}\n {:model-type :sklearn.classification/random-forest-classifier}\n {:model-type :metamorph.ml/dummy-classifier}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name \"logistic\"\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}]\n :tribuo-trainer-name \"logistic\"}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name 
\"random-forest\"\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\"\n :properties {:maxDepth \"8\"\n :useRandomSplitPoints \"false\"\n :fractionFeaturesInSplit \"0.5\"}}]\n :tribuo-trainer-name \"random-forest\"}])\n\nThis uses models from Smile and Tribuo, but could be any metamorph.ml compliant model ( library sklearn-clj wraps all python sklearn models, for example)\nThe list of feature combinations to try for each model:\n\n(def feature-combinations\n [[:sex :pclass :embarked]\n [:sex]\n [:pclass :embarked]\n [:embarked]\n [:sex :embarked]\n [:sex :pclass]])\n\ngenerate 24 pipeline functions:\n\n(def pipe-fns\n (for [model models\n feature-combination feature-combinations]\n (make-pipe-fn model feature-combination)))\n\n\n(count pipe-fns)\n\n\n42\n\nExecute all pipelines for all splits in the cross-validations and return best model by classification-accuracy\n\n(def evaluation-results\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy))\n\nBy default it returns the best mode only\n\n(make-results-ds evaluation-results)\n\n\n_unnamed [1 3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :scicloj.ml.tribuo/classification,\n\n\n\n\n:tribuo-components\n\n\n\n\n[{:name random-forest,\n\n\n\n\n:type org.tribuo.classification.dtree.CARTClassificationTrainer,\n\n\n\n\n:properties\n\n\n\n\n{:maxDepth 8,\n\n\n\n\n:useRandomSplitPoints false,\n\n\n\n\n:fractionFeaturesInSplit 0.5}}],\n\n\n\n\n:tribuo-trainer-name random-forest}\n\n\n\n\nThe key observation is here, that the metamorph pipelines allow to not only grid-search over the model hyper-parameters, but as well over arbitrary pipeline variations, like which features to include. 
Both get handled in the same way.\nWe can get all results as well:\n\n(def evaluation-results-all\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy\n {:map-fn :map\n :return-best-crossvalidation-only false\n :return-best-pipeline-only false}))\n\nIn total it creates and evaluates 4 models * 6 feature configurations * 5 CV = 120 models\n\n(-> evaluation-results-all flatten count)\n\n\n210\n\nWe can find the best as well by hand, it’s the first from the list, when sorted by accuracy.\n\n(-> (make-results-ds evaluation-results-all)\n (tc/unique-by)\n (tc/order-by [:mean-accuracy] :desc)\n (tc/head 20)\n (kind/table))\n\n\n\n\n\n\n\n\n\n\n\nused-features\nmean-accuracy\noptions\n\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name 
\"logistic\"}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7852091665079668\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :pclass]\n\n0.775096806957405\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7750777629657843\n\n{:model-type 
:sklearn.classification/logistic-regression}", + "text": "8.5 Finding the best model automatically\nThe advantage of the pipelines is even more visible, if we want to have configurable pipelines, and do a grid search to find optimal settings.\nthe following will find the best model across:\n\n4 different model classes\n6 different selections of used features\nk-cross validate this with different test / train splits\n\n\n(defn make-pipe-fn [model-spec features]\n (mm/pipeline\n ;; store the used features in ctx, so we can retrieve them at the end\n (fn [ctx]\n (assoc ctx :used-features features))\n (mm/lift tc/select-columns (conj features :survived))\n {:metamorph/id :model} (ml/model model-spec)))\n\nCreate a 5-K cross validation split of the data:\n\n(def titanic-k-fold (tc/split->seq ml-basic/numeric-titanic-data :kfold {:seed 12345}))\n\n\n(-> titanic-k-fold count)\n\n\n5\n\nThe list of the model types we want to try:\n\n(def models [{ :model-type :xgboost/classification\n :round 10}\n {:model-type :sklearn.classification/decision-tree-classifier}\n {:model-type :sklearn.classification/logistic-regression}\n {:model-type :sklearn.classification/random-forest-classifier}\n {:model-type :metamorph.ml/dummy-classifier}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name \"logistic\"\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}]\n :tribuo-trainer-name \"logistic\"}\n {:model-type :scicloj.ml.tribuo/classification\n :tribuo-components [{:name \"random-forest\"\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\"\n :properties {:maxDepth \"8\"\n :useRandomSplitPoints \"false\"\n :fractionFeaturesInSplit \"0.5\"}}]\n :tribuo-trainer-name \"random-forest\"}])\n\nThis uses models from Smile and Tribuo, but could be any metamorph.ml compliant model ( library sklearn-clj wraps all python sklearn models, for example)\nThe list of feature combinations to try for each model:\n\n(def feature-combinations\n 
[[:sex :pclass :embarked]\n [:sex]\n [:pclass :embarked]\n [:embarked]\n [:sex :embarked]\n [:sex :pclass]])\n\ngenerate 24 pipeline functions:\n\n(def pipe-fns\n (for [model models\n feature-combination feature-combinations]\n (make-pipe-fn model feature-combination)))\n\n\n(count pipe-fns)\n\n\n42\n\nExecute all pipelines for all splits in the cross-validations and return best model by classification-accuracy\n\n(def evaluation-results\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy))\n\nBy default it returns the best mode only\n\n(make-results-ds evaluation-results)\n\n\n_unnamed [1 3]:\n\n\n\n\n\n\n\n\n:used-features\n:mean-accuracy\n:options\n\n\n\n\n[:sex :pclass :embarked]\n0.81107726\n{:model-type :scicloj.ml.tribuo/classification,\n\n\n\n\n:tribuo-components\n\n\n\n\n[{:name random-forest,\n\n\n\n\n:type org.tribuo.classification.dtree.CARTClassificationTrainer,\n\n\n\n\n:properties\n\n\n\n\n{:maxDepth 8,\n\n\n\n\n:useRandomSplitPoints false,\n\n\n\n\n:fractionFeaturesInSplit 0.5}}],\n\n\n\n\n:tribuo-trainer-name random-forest}\n\n\n\n\nThe key observation is here, that the metamorph pipelines allow to not only grid-search over the model hyper-parameters, but as well over arbitrary pipeline variations, like which features to include. 
Both get handled in the same way.\nWe can get all results as well:\n\n(def evaluation-results-all\n (ml/evaluate-pipelines\n pipe-fns\n titanic-k-fold\n loss/classification-accuracy\n :accuracy\n {:map-fn :map\n :return-best-crossvalidation-only false\n :return-best-pipeline-only false}))\n\nIn total it creates and evaluates 4 models * 6 feature configurations * 5 CV = 120 models\n\n(-> evaluation-results-all flatten count)\n\n\n210\n\nWe can find the best as well by hand, it’s the first from the list, when sorted by accuracy.\n\n(-> (make-results-ds evaluation-results-all)\n (tc/unique-by)\n (tc/order-by [:mean-accuracy] :desc)\n (tc/head 20)\n (kind/table))\n\n\n\n\n\n\n\n\n\n\n\nused-features\nmean-accuracy\noptions\n\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.8110772551260077\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name 
\"logistic\"}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/random-forest-classifier}\n\n\n\n\n[:sex :pclass]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex :embarked]\n\n0.7863327620135847\n\n{:model-type :xgboost/classification, :round 10}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :sklearn.classification/decision-tree-classifier}\n\n\n\n\n[:sex]\n\n0.7863327620135847\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"random-forest\",\n :type \"org.tribuo.classification.dtree.CARTClassificationTrainer\",\n :properties\n {:maxDepth \"8\",\n :useRandomSplitPoints \"false\",\n :fractionFeaturesInSplit \"0.5\"}}],\n :tribuo-trainer-name \"random-forest\"}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7852091665079668\n\n{:model-type :scicloj.ml.tribuo/classification,\n :tribuo-components\n [{:name \"logistic\",\n :type \"org.tribuo.classification.sgd.linear.LinearSGDTrainer\"}],\n :tribuo-trainer-name \"logistic\"}\n\n\n\n\n[:sex :pclass :embarked]\n\n0.7750777629657843\n\n{:model-type :sklearn.classification/logistic-regression}\n\n\n\n\n[:sex :pclass]\n\n0.773973211451787\n\n{:model-type 
:sklearn.classification/random-forest-classifier}", "crumbs": [ "Tutorials", "8  AutoML using metamorph pipelines"