diff --git a/notebooks/2.0-mb-data_preprocessing_loading_splitting.ipynb b/notebooks/2.0-mb-data_preprocessing_loading_splitting.ipynb index 151d0af4..c28fbabf 100644 --- a/notebooks/2.0-mb-data_preprocessing_loading_splitting.ipynb +++ b/notebooks/2.0-mb-data_preprocessing_loading_splitting.ipynb @@ -3351,7 +3351,7 @@ } ], "source": [ - "# use shap instead of feature importance to maintain consistency throughout the work\n", + "# use shap to maintain consistency throughout the work\n", "model.fit(X, y)\n", "explainer = shap.TreeExplainer(model)\n", "shap_values = explainer.shap_values(Pool(X, y, cat_features=cat_features))\n", @@ -3804,9 +3804,10 @@ "outputs": [], "source": [ "# randomly sample 10 % of rows\n", - "train = train.sample(frac=0.1, random_state=seed)\n", - "valid = valid.sample(frac=0.1, random_state=seed)\n", - "test = test.sample(frac=0.1, random_state=seed)\n" + "frac= 0.1\n", + "train = train.sample(frac=frac, random_state=seed)\n", + "valid = valid.sample(frac=frac, random_state=seed)\n", + "test = test.sample(frac=frac, random_state=seed)\n" ] }, { @@ -4159,7 +4160,9 @@ } ], "source": [ + "# number of most frequent ROOT values compared across splits\n", "n = 20\n", + "\n", "ser_train = train[\"ROOT\"].value_counts()[:n]\n", "ser_valid = valid[\"ROOT\"].value_counts()[:n]\n", "ser_test = test[\"ROOT\"].value_counts()[:n]\n",