ml xgboost class spec

Open-EO · Dec 12, 2023 · d063bbd · d063bbd
1 parent b162040
commit d063bbd
Show file tree

Hide file tree

Showing 3 changed files with 122 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
     - `load_ml_model`
     - `load_url`
     - `ml_fit_class_random_forest`
+    - `ml_fit_class_xgboost`
     - `ml_fit_regr_random_forest`
     - `ml_predict`
     - `save_ml_model`

diff --git a/proposals/ml_fit_class_xgboost.json b/proposals/ml_fit_class_xgboost.json
@@ -0,0 +1,115 @@
+{
+    "id": "ml_fit_class_xgboost",
+    "summary": "Train an XGBoost classification model",
+    "description": "Fit an XGBoost classification model to training data. XGBoost is a high-performance, flexible, and portable distributed gradient boosting library. It implements machine learning algorithms within the Gradient Boosting framework, featuring parallel tree boosting for efficiency.",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the XGBoost classification model as a vector data cube. They are the independent variables that the XGBoost algorithm analyses to learn patterns and relationships within the data.",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    },
+                    {
+                        "type": "bands"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "target",
+            "description": "Labeled data for XGBoost classification, aligning with predictor values based on a shared geometry dimension. This ensures a clear connection between predictor rows and labels.",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "learning_rate",
+            "description": "Step size shrinkage used in update to prevent overfitting.",
+            "optional": true,
+            "default": 0.15,
+            "schema": {
+                "type": "number",
+                "minimum": 0
+            }
+        },
+        {
+            "name": "max_depth",
+            "description": "Maximum depth of a tree.",
+            "optional": true,
+            "default": 5,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "min_child_weight",
+            "description": "Minimum sum of instance weight (hessian) needed in a child.",
+            "optional": true,
+            "default": 1,
+            "schema": {
+                "type": "number",
+                "minimum": 0
+            }
+        },
+        {
+            "name": "subsample",
+            "description": "Subsample ratio of the training instances.",
+            "optional": true,
+            "default": 0.8,
+            "schema": {
+                "type": "number",
+                "minimum": 0,
+                "maximum": 1
+            }
+        },
+        {
+            "name": "min_split_loss",
+            "description": "Minimum loss reduction required to make a further partition on a leaf node of the tree.",
+            "optional": true,
+            "default": 1,
+            "schema": {
+                "type": "number",
+                "minimum": 0
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with `save_ml_model()` and restored with `load_ml_model()`.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://dl.acm.org/doi/10.1145/2939672.2939785",
+            "title": "Chen and Guestrin (2016), XGBoost: A Scalable Tree Boosting System",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/tests/.words b/tests/.words
@@ -47,3 +47,9 @@ Hyndman
 date1
 date2
 favor
+XGBoost
+Chen
+Guestrin
+Subsample
+hessian
+overfitting