From a89064fa956282872ecc2133d4106fcedcd13c97 Mon Sep 17 00:00:00 2001
From: Richard Stotz <richardstotz@google.com>
Date: Mon, 23 Sep 2024 06:26:08 -0700
Subject: [PATCH] [YDF] Prepare release of PYDF 0.8.0

PiperOrigin-RevId: 677766701
---
 README.md                                     | 23 +------------------
 documentation/public/docs/hyperparameters.md  | 21 +++++++++--------
 .../port/python/CHANGELOG.md                  | 11 +++++----
 .../port/python/README.md                     | 18 ++++-----------
 .../port/python/config/setup.py               |  2 +-
 .../port/python/dev_requirements.txt          |  3 +--
 .../port/python/tools/release_windows.bat     |  2 +-
 .../port/python/ydf/dataset/dataset.py        |  5 ++--
 .../port/python/ydf/dataset/io/BUILD          |  2 ++
 .../python/ydf/dataset/io/dataset_io_types.py |  6 ++---
 .../port/python/ydf/model/generic_model.py    | 19 +++++++--------
 .../port/python/ydf/version.py                |  2 +-
 12 files changed, 45 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index 9d620bba..b0027105 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 [![PyPI Downloads](https://img.shields.io/pypi/dm/ydf?style=flat-square)](https://pepy.tech/project/ydf)
 
 **YDF** (Yggdrasil Decision Forests) is a library to train, evaluate, interpret,
-and serve Random Forest, Gradient Boosted Decision Trees, and CART decision
+and serve Random Forest, Gradient Boosted Decision Trees, CART and Isolation
 forest models.
 
 See the [documentation](https://ydf.readthedocs.org/) for more information on
@@ -84,27 +84,6 @@ SaveModel("my_model", model.get());
 
 (based on [examples/beginner.cc](examples/beginner.cc))
 
-The same model can be trained in Python using TensorFlow Decision Forests as
-follows:
-
-```python
-import tensorflow_decision_forests as tfdf
-import pandas as pd
-
-# Load dataset in a Pandas dataframe.
-train_df = pd.read_csv("project/train.csv")
-
-# Convert dataset into a TensorFlow dataset.
-train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="my_label")
-
-# Train model
-model = tfdf.keras.RandomForestModel()
-model.fit(train_ds)
-
-# Export model.
-model.save("project/model")
-```
-
 ## Next steps
 
 Check the
diff --git a/documentation/public/docs/hyperparameters.md b/documentation/public/docs/hyperparameters.md
index 368529f5..a45820da 100644
--- a/documentation/public/docs/hyperparameters.md
+++ b/documentation/public/docs/hyperparameters.md
@@ -137,8 +137,8 @@ reasonable time.
 
 -   **Type:** Integer **Default:** 5 **Possible values:** min:1
 
--   Truncation of the cross-entropy NDCG loss. Only used with cross-entropy NDCG
-    loss i.e. `loss="XE_NDCG_MART"`
+-   Truncation of the cross-entropy NDCG loss (default 5). Only used with
+    cross-entropy NDCG loss i.e. `loss="XE_NDCG_MART"`
 
 #### [dart_dropout](https://github.com/google/yggdrasil-decision-forests/blob/main/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.proto)
 
@@ -172,18 +172,19 @@ reasonable time.
 
 -   **Type:** Real **Default:** 0.5 **Possible values:** min:0 max:1
 
--   EXPERIMENTAL. Weighting parameter for focal loss, positive samples weighted
-    by alpha, negative samples by (1-alpha). The default 0.5 value means no
-    active class-level weighting. Only used with focal loss i.e.
+-   EXPERIMENTAL, default 0.5. Weighting parameter for focal loss, positive
+    samples weighted by alpha, negative samples by (1-alpha). The default 0.5
+    value means no active class-level weighting. Only used with focal loss i.e.
     `loss="BINARY_FOCAL_LOSS"`
 
 #### [focal_loss_gamma](https://github.com/google/yggdrasil-decision-forests/blob/main/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.proto)
 
 -   **Type:** Real **Default:** 2 **Possible values:** min:0
 
--   EXPERIMENTAL. Exponent of the misprediction exponent term in focal loss,
-    corresponds to gamma parameter in https://arxiv.org/pdf/1708.02002.pdf. Only
-    used with focal loss i.e. `loss="BINARY_FOCAL_LOSS"`
+-   EXPERIMENTAL, default 2.0. Exponent of the misprediction exponent term in
+    focal loss, corresponds to gamma parameter in
+    https://arxiv.org/pdf/1708.02002.pdf. Only used with focal loss i.e.
+    `loss="BINARY_FOCAL_LOSS"`
 
 #### [forest_extraction](https://github.com/google/yggdrasil-decision-forests/blob/main/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees.proto)
 
@@ -365,8 +366,8 @@ reasonable time.
 
 -   **Type:** Integer **Default:** 5 **Possible values:** min:1
 
--   Truncation of the NDCG loss. Only used with NDCG loss i.e.
-    `loss="LAMBDA_MART_NDCG"`
+-   Truncation of the NDCG loss (default 5). Only used with NDCG loss i.e.
+    `loss="LAMBDA_MART_NDCG"`.
 
 #### [num_candidate_attributes](https://github.com/google/yggdrasil-decision-forests/blob/main/yggdrasil_decision_forests/learner/decision_tree/decision_tree.proto)
 
diff --git a/yggdrasil_decision_forests/port/python/CHANGELOG.md b/yggdrasil_decision_forests/port/python/CHANGELOG.md
index 6652a1c2..fa6e1d19 100644
--- a/yggdrasil_decision_forests/port/python/CHANGELOG.md
+++ b/yggdrasil_decision_forests/port/python/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Changelog
 
-## HEAD
+## 0.8.0 - 2024-09-23
 
 ### Breaking
 
@@ -22,7 +22,7 @@
 -   Add `num_examples_per_tree()` method to Isolation Forest models.
 -   Expose the slow engine for debugging predictions and evaluations with
     `use_slow_engine=True`.
--   Speed-up training of GBT models by ~10%
+-   Speed-up training of GBT models by ~10%.
 -   Support for categorical and boolean features in Isolation Forests.
 -   Add `ydf.util.read_tf_record` and `ydf.util.write_tf_record` to facilitate
     TF Record datasets usage.
@@ -36,14 +36,17 @@
 -   Add argument to control the maximum duration of `model.analyze`.
 -   Add support for Unicode strings, normalize categorical set values in the
     same way as categorical values, and validate their types.
--   Native support for PyGrain DataLoader and Dataset for all operations (e.g.,
-    training, evaluation, predictions).
 -   Add support for distributed training for ranking gradient boosted tree
     models.
 
 ### Fix
 
 -   Fix labels of regression evaluation plots
+-   Improve error messages when Isolation Forest training fails.
+
+### Release music
+
+Perpetuum Mobile "Ein musikalischer Scherz", Op. 257. Johann Strauss (Sohn)
 
 ## 0.7.0 - 2024-08-21
 
diff --git a/yggdrasil_decision_forests/port/python/README.md b/yggdrasil_decision_forests/port/python/README.md
index 5ca4ce7c..a1b46bd7 100644
--- a/yggdrasil_decision_forests/port/python/README.md
+++ b/yggdrasil_decision_forests/port/python/README.md
@@ -5,10 +5,10 @@ Decision Forests. It allows direct, fast access to YDF's methods and it also
 offers advanced import / export, evaluation and inspection methods. While the
 package is called YDF, the wrapping code is sometimes lovingly called *PYDF*.
 
-It is not a replacement for its sister project 
+YDF is the successor of
 [Tensorflow Decision Forests](https://github.com/tensorflow/decision-forests) 
-(TF-DF). Instead, it complements TF-DF for use cases that cannot be solved 
-through the Keras API.
+(TF-DF). TF-DF is still maintained, but new projects should choose YDF for
+improved performance, better model quality and more features.
 
 ## Installation
 
@@ -41,15 +41,5 @@ loaded_model = ydf.load_model("my_model")
 
 ## Frequently Asked Questions
 
-*   **Is it PYDF or YDF?** The name of the library is simply ydf, and so is the
-    name of the corresponding Pip package. Internally, the team sometimes uses
-    the name *PYDF* because it fits so well.
-*   **What is the status of PYDF?** PYDF is currently in Alpha development. Most
-    parts already work well (training, evaluation, predicting, export), some new
-    features are yet to come. The API surface is mostly stable but may still 
-    change without notice.
-*   **Where is the documentation for PYDF?** The documentation is
-    available on https://ydf.readthedocs.org.
-*   **How should I pronounce PYDF?** The preferred pronunciation is 
-    "Py-dee-eff" / ˈpaɪˈdiˈɛf (IPA)
+See the [FAQ](https://ydf.readthedocs.io/en/latest/faq/) in the documentation.
 
diff --git a/yggdrasil_decision_forests/port/python/config/setup.py b/yggdrasil_decision_forests/port/python/config/setup.py
index 29d657d7..a6e5c3c3 100644
--- a/yggdrasil_decision_forests/port/python/config/setup.py
+++ b/yggdrasil_decision_forests/port/python/config/setup.py
@@ -22,7 +22,7 @@
 from setuptools.command.install import install
 from setuptools.dist import Distribution
 
-_VERSION = "0.7.0"
+_VERSION = "0.8.0"
 
 with open("README.md", "r", encoding="utf-8") as fh:
   long_description = fh.read()
diff --git a/yggdrasil_decision_forests/port/python/dev_requirements.txt b/yggdrasil_decision_forests/port/python/dev_requirements.txt
index 1e66089d..b8448cb8 100644
--- a/yggdrasil_decision_forests/port/python/dev_requirements.txt
+++ b/yggdrasil_decision_forests/port/python/dev_requirements.txt
@@ -13,5 +13,4 @@ jax; platform_machine != 'aarch64' and platform_system != 'Windows'
 jaxlib; platform_machine != 'aarch64' and platform_system != 'Windows'
 optax; platform_machine != 'aarch64' and platform_system != 'Windows' and python_version >= '3.9'
 flatbuffers; platform_machine != 'aarch64' and platform_system != 'Windows' and python_version >= '3.12'
-tensorflow-datasets; platform_machine != 'aarch64' and platform_system != 'Windows' and python_version >= '3.9'
-grain
\ No newline at end of file
+tensorflow-datasets; platform_machine != 'aarch64' and platform_system != 'Windows' and python_version >= '3.9'
\ No newline at end of file
diff --git a/yggdrasil_decision_forests/port/python/tools/release_windows.bat b/yggdrasil_decision_forests/port/python/tools/release_windows.bat
index 814613c4..a391e85f 100644
--- a/yggdrasil_decision_forests/port/python/tools/release_windows.bat
+++ b/yggdrasil_decision_forests/port/python/tools/release_windows.bat
@@ -34,7 +34,7 @@
 cls
 setlocal
 
-set YDF_VERSION=0.7.0
+set YDF_VERSION=0.8.0
 set BAZEL=bazel.exe
 set BAZEL_SH=C:\msys64\usr\bin\bash.exe
 set BAZEL_FLAGS=--config=windows_cpp20 --config=windows_avx2
diff --git a/yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py b/yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py
index 0eddd341..a1deb353 100644
--- a/yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py
+++ b/yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py
@@ -372,8 +372,9 @@ def create_vertical_dataset(
   Args:
     data: Source dataset. Supported formats: VerticalDataset, (typed) path, list
       of (typed) paths, Pandas DataFrame, Xarray Dataset, TensorFlow Dataset,
-      PyGrain DataLoader and Dataset, dictionary of string to NumPy array or
-      lists. If the data is already a VerticalDataset, it is returned unchanged.
+      PyGrain DataLoader and Dataset (experimental, Linux only), dictionary of
+      string to NumPy array or lists. If the data is already a VerticalDataset,
+      it is returned unchanged.
     columns: If None, all columns are imported. The semantic of the columns is
       determined automatically. Otherwise, if include_all_columns=False
       (default) only the column listed in `columns` are imported. If
diff --git a/yggdrasil_decision_forests/port/python/ydf/dataset/io/BUILD b/yggdrasil_decision_forests/port/python/ydf/dataset/io/BUILD
index a5ea8db5..167225a7 100644
--- a/yggdrasil_decision_forests/port/python/ydf/dataset/io/BUILD
+++ b/yggdrasil_decision_forests/port/python/ydf/dataset/io/BUILD
@@ -98,6 +98,8 @@ py_test(
 py_test(
     name = "pygrain_io_test",
     srcs = ["pygrain_io_test.py"],
+    # TODO: Figure out what to do with PyGrain support, since it does not work on macOS.
+    tags = ["manual"],  # Grain is not supported on macOS
     deps = [
         ":dataset_io_types",
         ":pygrain_io",
diff --git a/yggdrasil_decision_forests/port/python/ydf/dataset/io/dataset_io_types.py b/yggdrasil_decision_forests/port/python/ydf/dataset/io/dataset_io_types.py
index a0550915..9f639ae4 100644
--- a/yggdrasil_decision_forests/port/python/ydf/dataset/io/dataset_io_types.py
+++ b/yggdrasil_decision_forests/port/python/ydf/dataset/io/dataset_io_types.py
@@ -85,9 +85,9 @@
   3. A Xarray dataset.
   4. A YDF VerticalDataset created with `ydf.create_vertical_dataset`. This option is the most efficient when the same dataset is used multiple times.
   5. A batched TensorFlow Dataset.
-  6. A PyGrain DataLoader or Dataset.
-  7. A typed path to a csv file e.g. "csv:/tmp/dataset.csv". See supported types below. The path can be sharded (e.g. "csv:/tmp/dataset@10") or globbed ("csv:/tmp/dataset*").
-  8. A list of typed paths e.g. ["csv:/tmp/data1.csv", "csv:/tmp/data2.csv"]. See supported types below.
+  6. A typed path to a csv file e.g. "csv:/tmp/dataset.csv". See supported types below. The path can be sharded (e.g. "csv:/tmp/dataset@10") or globbed ("csv:/tmp/dataset*").
+  7. A list of typed paths e.g. ["csv:/tmp/data1.csv", "csv:/tmp/data2.csv"]. See supported types below.
+  8. A PyGrain DataLoader or Dataset (experimental, Linux only).
 
 The supported file formats and corresponding prefixes are:
   - CSV file. prefix 'csv:'
diff --git a/yggdrasil_decision_forests/port/python/ydf/model/generic_model.py b/yggdrasil_decision_forests/port/python/ydf/model/generic_model.py
index dbffb18c..e40bf892 100644
--- a/yggdrasil_decision_forests/port/python/ydf/model/generic_model.py
+++ b/yggdrasil_decision_forests/port/python/ydf/model/generic_model.py
@@ -418,8 +418,9 @@ def predict(
     Args:
       data: Dataset. Supported formats: VerticalDataset, (typed) path, list of
         (typed) paths, Pandas DataFrame, Xarray Dataset, TensorFlow Dataset,
-        PyGrain DataLoader and Dataset, dictionary of string to NumPy array or
-        lists. If the dataset contains the label column, that column is ignored.
+        PyGrain DataLoader and Dataset (experimental, Linux only), dictionary of
+        string to NumPy array or lists. If the dataset contains the label
+        column, that column is ignored.
       use_slow_engine: If true, uses the slow engine for making predictions. The
         slow engine of YDF is an order of magnitude slower than the other
         prediction engines. There exist very rare edge cases where predictions
@@ -506,8 +507,8 @@ def evaluate(
     Args:
       data: Dataset. Supported formats: VerticalDataset, (typed) path, list of
         (typed) paths, Pandas DataFrame, Xarray Dataset, TensorFlow Dataset,
-        PyGrain DataLoader and Dataset, dictionary of string to NumPy array or
-        lists.
+        PyGrain DataLoader and Dataset (experimental, Linux only), dictionary of
+        string to NumPy array or lists.
       weighted: If true, the evaluation is weighted according to the training
         weights. If false, the evaluation is non-weighted. b/351279797: Change
         default to weights=True.
@@ -655,8 +656,8 @@ def analyze_prediction(
     Args:
       single_example: Example to explain. Supported formats: VerticalDataset,
         (typed) path, list of (typed) paths, Pandas DataFrame, Xarray Dataset,
-        TensorFlow Dataset, PyGrain DataLoader and Dataset, dictionary of string
-        to NumPy array or lists.
+        TensorFlow Dataset, PyGrain DataLoader and Dataset (experimental, Linux
+        only), dictionary of string to NumPy array or lists.
 
     Returns:
       Prediction explanation.
@@ -714,8 +715,8 @@ def analyze(
     Args:
       data: Dataset. Supported formats: VerticalDataset, (typed) path, list of
         (typed) paths, Pandas DataFrame, Xarray Dataset, TensorFlow Dataset,
-        PyGrain DataLoader and Dataset, dictionary of string to NumPy array or
-        lists.
+        PyGrain DataLoader and Dataset (experimental, Linux only), dictionary of
+        string to NumPy array or lists.
       sampling: Ratio of examples to use for the analysis. The analysis can be
         expensive to compute. On large datasets, use a small sampling value e.g.
         0.01.
@@ -1463,7 +1464,7 @@ def _build_evaluation_dataspec(
     effective_dataspec = self._model.data_spec()
 
     def find_existing_or_add_column(
-        semantic: Optional[data_spec_pb2.ColumnType],
+        semantic: Optional[Any],
         name: Optional[str],
         default_col_idx: int,
         usage: str,
diff --git a/yggdrasil_decision_forests/port/python/ydf/version.py b/yggdrasil_decision_forests/port/python/ydf/version.py
index 5bd6b189..af929250 100644
--- a/yggdrasil_decision_forests/port/python/ydf/version.py
+++ b/yggdrasil_decision_forests/port/python/ydf/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-version = "0.7.0"
+version = "0.8.0"