diff --git a/.gitignore b/.gitignore index b871fd9..73cd115 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.DS_Store .idea/ __pycache__ build @@ -7,3 +8,4 @@ dist *.egg-info .egg-info .ipynb_checkpoints +tmp diff --git a/README.md b/README.md index 92738e0..3029c7c 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,9 @@ Alternatively, if you want to customize the source code, you may install in the pip install -e . ``` -in the cloned directory. +in the cloned directory. Note that this will download some large files (about 100MB in total) to warm-start TensorOboe fitting; this saves setup time (in minutes) at the cost of disk space and network data usage. + +It is recommended to install within an isolated environment (a conda virtual environment, for example) to avoid conflicting dependency versions. #### Dependencies with verified versions @@ -36,14 +38,14 @@ The Oboe systems work on Python 3.7 or later. The following libraries are requir * scipy (1.4.1) * pandas (0.24.2) * scikit-learn (0.22.1) -* tensorly (0.4.4) +* tensorly (0.6.0) * OpenML (0.9.0) * mkl (>=1.0.0) ## Examples -For more detailed examples, please refer to the Jupyter notebooks in the `example` folder. A basic classification example: +For more detailed examples, please refer to the Jupyter notebooks in the `example` folder. A basic classification example using Oboe: ```python method = 'Oboe' # 'Oboe' or 'TensorOboe' @@ -69,6 +71,11 @@ print("selected models: {}".format(m.get_models())) ``` +## Warm-start meta-training + +The `large_files` folder includes some large `numpy` arrays that are intermediate results of previous meta-training. This folder is not included in the `pip` installation, and the files within it can be manually downloaded from this GitHub repository. + +The default functionality in TensorOboe is to skip the step of imputing missing entries in the error tensor and directly use the pre-imputed error tensor. If users want to impute the error tensor themselves, the original non-imputed error tensor can be found at `large_files/error_tensor_f16_compressed.npz`. To use it, set the `original_error_tensor_dir` argument to the directory containing this `.npz` file and set `mode` to `'initialize'` when creating the AutoLearner instance: `m = AutoLearner(..., method='TensorOboe', mode='initialize', original_error_tensor_dir=<directory containing the .npz file>)`. ## References [1] Chengrun Yang, Yuji Akimoto, Dae Won Kim, Madeleine Udell. OBOE: Collaborative filtering for AutoML model selection. KDD 2019.
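A minimal sketch (not part of this diff) of the `'initialize'` mode described in the README section above; the directory path is a placeholder, assuming the `large_files` folder has been manually downloaded from this repository:

```python
from oboe import AutoLearner

# Warm-start meta-training from the raw (non-imputed) error tensor.
# 'large_files' is a placeholder for the directory that contains
# error_tensor_f16_compressed.npz after a manual download.
m = AutoLearner(p_type='classification',
                method='TensorOboe',
                mode='initialize',  # impute the error tensor from scratch
                original_error_tensor_dir='large_files',
                runtime_limit=512,
                verbose=True)
```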
diff --git a/examples/classification_by_Oboe.ipynb b/examples/classification_by_Oboe.ipynb index 33b90c5..37ccdb9 100644 --- a/examples/classification_by_Oboe.ipynb +++ b/examples/classification_by_Oboe.ipynb @@ -13,41 +13,25 @@ "metadata": {}, "outputs": [], "source": [ - "# necessary modules\n", - "import sys\n", - "import pandas as pd\n", - "import os\n", - "import time\n", - "import numpy as np\n", - "import multiprocessing\n", + "method = 'Oboe'\n", + "problem_type = 'classification'\n", "\n", - "#import AutoLearner module by either specifying its relative path or doing pip installation\n", - "automl_path = '../automl'\n", - "sys.path.append(automl_path)\n", - "from auto_learner import AutoLearner\n", - "import util\n", + "from oboe import AutoLearner, error # This may take around 15 seconds at first run.\n", "\n", - "#import scikit-learn modules\n", + "import numpy as np\n", "from sklearn.datasets import load_iris\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score\n", + "import time\n", "\n", - "# disable warnings\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "#load and split dataset into training and test folds\n", "data = load_iris()\n", "x = np.array(data['data'])\n", "y = np.array(data['target'])\n", - "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)" + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)\n", + "\n", + "# disable warnings\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" ] }, { @@ -59,17 +43,17 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# initialize the autolearner class\n", - "m = AutoLearner(p_type='classification', runtime_limit=30, method='Oboe', verbose=False)" + "m = AutoLearner(p_type='classification', runtime_limit=30, method=method, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -81,42 +65,42 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "prediction error: 0.0\n", - "elapsed time: 29.56807851791382\n" + "prediction error: 0.04444444444444442\n", + "elapsed time: 27.44637703895569\n" ] } ], "source": [ "# use the fitted autolearner for prediction on test set\n", "y_predicted = m.predict(x_test)\n", - "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification'))) \n", + "print(\"prediction error: {}\".format(error(y_test, y_predicted, 'classification'))) \n", "print(\"elapsed time: {}\".format(elapsed_time))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ensemble method': 'select at most 5 pipelines with smallest cv error',\n", - " 'base learners': {'kSVM': [{'C': 4, 'kernel': 'rbf', 'coef0': 0},\n", - " {'C': 4, 'kernel': 'rbf', 'coef0': 10},\n", - " {'C': 16, 'kernel': 'rbf', 'coef0': 10},\n", - " {'C': 8, 'kernel': 'rbf', 'coef0': 0},\n", - " {'C': 4, 'kernel': 'rbf', 'coef0': 10}]}}" + " 'base learners': {'kSVM': [{'C': 0.5, 'kernel': 'poly', 'coef0': 10},\n", + " {'C': 0.25, 'kernel': 'poly', 'coef0': 10},\n", + " {'C': 0.125, 'kernel': 'poly', 'coef0': 10},\n", + " {'C': 0.125, 'kernel': 'poly', 'coef0': 10},\n", 
+ " {'C': 0.125, 'kernel': 'poly', 'coef0': 10}]}}" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -135,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -147,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -157,13 +141,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "#autolearner arguments\n", "autolearner_kwargs = {\n", " 'p_type': 'classification',\n", + " 'method': method,\n", " 'runtime_limit': RUNTIME_BUDGET,\n", " 'verbose': VERBOSE,\n", " 'selection_method': 'ED',\n", @@ -176,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -186,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -198,30 +183,30 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "prediction error: 0.025438596491228094\n", - "elapsed time: 29.11527419090271\n", - "individual accuracies of selected models: [0.0, 0.0, 0.025, 0.025, 0.025]\n" + "prediction error: 0.07037037037037036\n", + "elapsed time: 27.255645036697388\n", + "individual accuracies of selected models: [0.07, 0.07, 0.07, 0.07, 0.07]\n" ] } ], "source": [ "# use the fitted autolearner for prediction on test set\n", "y_predicted = m.predict(x_test)\n", - "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))\n", + "print(\"prediction error: {}\".format(error(y_test, y_predicted, 'classification')))\n", "print(\"elapsed time: {}\".format(elapsed_time))\n", "print(\"individual accuracies of selected models: {}\".format(m.get_model_accuracies(y_test)))" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": { "scrolled": true }, @@ -230,14 +215,14 @@ "data": { "text/plain": [ "{'ensemble method': 'select at most 5 pipelines with smallest cv error',\n", - " 'base learners': {'KNN': [{'n_neighbors': 5, 'p': 2},\n", - " {'n_neighbors': 5, 'p': 2}],\n", - " 'ExtraTrees': [{'min_samples_split': 2, 'criterion': 'entropy'},\n", - " {'min_samples_split': 0.01, 'criterion': 'entropy'},\n", - " {'min_samples_split': 2, 'criterion': 'entropy'}]}}" + " 'base learners': {'KNN': [{'n_neighbors': 1, 'p': 1},\n", + " {'n_neighbors': 7, 'p': 2},\n", + " {'n_neighbors': 3, 'p': 1},\n", + " {'n_neighbors': 9, 'p': 1},\n", + " {'n_neighbors': 3, 'p': 2}]}}" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -256,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -268,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -278,14 +263,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "#autolearner arguments\n", "autolearner_kwargs = {\n", - " 'method': 'Oboe',\n", " 'p_type': 'classification',\n", + " 'method': method,\n", " 'runtime_limit': RUNTIME_BUDGET,\n", " 'verbose': VERBOSE,\n", " 'selection_method': 'ED',\n", @@ -298,7 +283,7 @@ }, { 
"cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -308,9 +293,28 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Shape of training dataset: 120 data points, 4 features\n", + "Fitting with k=7, t=7.5\n", + "\n", + "Single round runtime target: 7.5\n", + "Fitting AutoLearner with max runtime 7.5s\n", + "Sampling 39 entries of new row...\n", + "Time limit reached.\n", + "KNN {'n_neighbors': 1, 'p': 2} complete.\n", + "KNN {'n_neighbors': 1, 'p': 1} complete.\n", + "KNN {'n_neighbors': 3, 'p': 1} complete.\n", + "DT {'min_samples_split': 64} complete.\n" + ] + } + ], "source": [ "# fit autolearner on training set and record runtime\n", "start = time.time()\n", @@ -320,15 +324,15 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "elapsed time: 14.332764387130737\n", - "accuracies of selected models: [0.025, 0.025, 0.025, 0.025, 0.0, 0.025, 0.025, 0.0, 0.0, 0.025, 0.025, 0.025, 0.051]\n" + "elapsed time: 12.52166485786438\n", + "accuracies of selected models: [0.07, 0.07, 0.07, 0.07, 0.096]\n" ] } ], @@ -349,28 +353,20 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'selected models': {'KNN': [{'n_neighbors': 1, 'p': 1},\n", - " {'n_neighbors': 1, 'p': 1},\n", + "{'selected models': {'KNN': [{'n_neighbors': 1, 'p': 2},\n", " {'n_neighbors': 1, 'p': 2},\n", - " {'n_neighbors': 3, 'p': 1},\n", - " {'n_neighbors': 5, 'p': 1},\n", - " {'n_neighbors': 3, 'p': 2},\n", - " {'n_neighbors': 7, 'p': 1},\n", - " {'n_neighbors': 5, 'p': 2},\n", - " {'n_neighbors': 7, 'p': 2}],\n", - " 'DT': [{'min_samples_split': 1e-05},\n", - " {'min_samples_split': 0.0001},\n", - " {'min_samples_split': 2},\n", - " {'min_samples_split': 32}]}}" + " {'n_neighbors': 1, 'p': 1},\n", + " {'n_neighbors': 3, 'p': 1}],\n", + " 'DT': [{'min_samples_split': 64}]}}" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -383,9 +379,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "test", "language": "python", - "name": "python3" + "name": "test" }, "language_info": { "codemirror_mode": { @@ -397,7 +393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.11" } }, "nbformat": 4, diff --git a/examples/classification_by_TensorOboe.ipynb b/examples/classification_by_TensorOboe.ipynb index 3b2eac3..5e75dd9 100644 --- a/examples/classification_by_TensorOboe.ipynb +++ b/examples/classification_by_TensorOboe.ipynb @@ -13,24 +13,21 @@ "metadata": {}, "outputs": [], "source": [ - "# necessary modules\n", - "import sys\n", - "import pandas as pd\n", - "import os\n", - "import time\n", - "import numpy as np\n", - "import multiprocessing\n", + "method = 'tensoroboe' # 'Oboe' or 'TensorOboe'\n", + "problem_type = 'classification'\n", "\n", - "#import AutoLearner module by either specifying its relative path or doing pip installation\n", - "automl_path = '../automl'\n", - "sys.path.append(automl_path)\n", - "from auto_learner import AutoLearner\n", - "import util\n", + "from oboe import AutoLearner, error # This may take around 15 seconds at first run.\n", "\n", - 
"#import scikit-learn modules\n", + "import numpy as np\n", "from sklearn.datasets import load_iris\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score\n", + "import time\n", + "\n", + "data = load_iris()\n", + "x = np.array(data['data'])\n", + "y = np.array(data['target'])\n", + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)\n", "\n", "# disable warnings\n", "import warnings\n", @@ -38,35 +35,29 @@ ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "#load and split dataset into training and test folds\n", - "data = load_iris()\n", - "x = np.array(data['data'])\n", - "y = np.array(data['target'])\n", - "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)\n", - "categorical = [False for _ in range(x.shape[1])] # a Boolean list of feature types: categorical or not" + "# Example 1: a no-brainer use" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Example 1: a no-brainer use" + "The default `TensorOboe` running mode is `warm`, which means the meta-training is warm-started with pre-imputed error tensor." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "rank for EM-Tucker imputation: (20, 4, 2, 2, 8, 20)\n", "shape of the error tensor: (551, 4, 2, 2, 8, 183)\n", "Loading latent factors from storage ...\n", "Loading saved runtime predictors ...\n" @@ -75,7 +66,7 @@ ], "source": [ "# initialize the autolearner class\n", - "m = AutoLearner(p_type='classification', runtime_limit=100, method='TensorOboe', verbose=True)" + "m = AutoLearner(p_type='classification', runtime_limit=50, method=method, verbose=True)" ] }, { @@ -87,6 +78,13 @@ "name": "stdout", "output_type": "stream", "text": [ + "\n", + "Shape of training dataset: 120 data points, 4 features\n", + "Splitting training set into training and validation ..\n", + "Predicting pipeline running time ..\n", + "runtime limit of initial round: 32.0 seconds\n", + "fitting and kfold_fit_validating the best-on-average pipeline\n", + "Pipeline fitting completed.\n", "Fitted an ensemble with size 1\n", "having a capped running time of 32 seconds\n", "Fitted an ensemble with size 1\n", @@ -94,86 +92,78 @@ "Fitted an ensemble with size 1\n", "Fitted an ensemble with size 1\n", "Fitted an ensemble with size 1\n", + "Doubling process started ...\n", + "Fitting with ranks=(20, 4, 2, 2, 8, 18), t=32.0\n", + "\n", + "Single round runtime target: 32.0\n", + "Fitting AutoLearner with maximum runtime 32.0 seconds\n", + "Selecting an initial set of models to evaluate ...\n", + "greedy_initialization\n", + "[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]\n", + "Sampling 8 entries of new row...\n", + "pool fitting completed\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "length of sampled indices: 8\n", + "[13985, 18377, 22769, 14168, 8312] candidate learners need to be k-fold fitted\n", + "Fitting 5 pipelines predicted to be the best ...\n", + "Number of candidate learners in the ensemble: 11\n", + "\n", + "Fitting ensemble of maximum size 11...\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + 
"Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639]\n", - "Fitted an ensemble with size 5\n", - "having a capped running time of 47 seconds\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639]\n", - "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639]\n", - "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639]\n", - "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639]\n", - "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639]\n", - "Fitted an ensemble with size 5\n", - "Fitting a candidate learners not fitted before ..\n", - "Fitting a candidate learners not fitted before ..\n", - "Fitting a candidate learners not fitted before ..\n", - "Fitting a candidate learners not fitted before ..\n", - "Fitting a candidate learners not fitted before ..\n", - "Fitting a candidate learners not fitted before ..\n", - "Fitting a candidate learners not fitted before ..\n", + "Pipeline fitting completed.\n", "Fitting a candidate learners not fitted before ..\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639\n", - " 0.03532407 0.03639862 0.04410714 0.04845199 0.04917639 0.03532407\n", - " 0.05244009 0.05244009 0.06401416 0.10994281 0.10026766 0.10026766\n", - " 0.10026766 0.03639862 0.05628307]\n", + "Pipeline fitting completed.\n", + "cv errors: [0.04375 0.06875 0.075 0.0875 0.10625 0.06875 0.0625 0.0625 0.0625\n", + " 0.05 0.05 ]\n", "Fitted an ensemble with size 5\n", - "having a capped running time of 40 seconds\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639\n", - " 0.03532407 0.03639862 0.04410714 0.04845199 0.04917639 0.03532407\n", - " 0.05244009 0.05244009 0.06401416 0.10994281 0.10026766 0.10026766\n", - " 0.10026766 0.03639862 0.05628307]\n", + "\n", + "AutoLearner fitting complete.\n", + "\n", + "Got a new ensemble in the round with runtime target 32.0 seconds\n", + "having a capped running time of 17 seconds\n", + "cv errors: [0.04375 0.06875 0.075 0.0875 0.10625 0.06875 0.0625 0.0625 0.0625\n", + " 0.05 0.05 ]\n", "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 
0.04845199 0.04917639\n", - " 0.03532407 0.03639862 0.04410714 0.04845199 0.04917639 0.03532407\n", - " 0.05244009 0.05244009 0.06401416 0.10994281 0.10026766 0.10026766\n", - " 0.10026766 0.03639862 0.05628307]\n", + "cv errors: [0.04375 0.06875 0.075 0.0875 0.10625 0.06875 0.0625 0.0625 0.0625\n", + " 0.05 0.05 ]\n", "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639\n", - " 0.03532407 0.03639862 0.04410714 0.04845199 0.04917639 0.03532407\n", - " 0.05244009 0.05244009 0.06401416 0.10994281 0.10026766 0.10026766\n", - " 0.10026766 0.03639862 0.05628307]\n", + "cv errors: [0.04375 0.06875 0.075 0.0875 0.10625 0.06875 0.0625 0.0625 0.0625\n", + " 0.05 0.05 ]\n", "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639\n", - " 0.03532407 0.03639862 0.04410714 0.04845199 0.04917639 0.03532407\n", - " 0.05244009 0.05244009 0.06401416 0.10994281 0.10026766 0.10026766\n", - " 0.10026766 0.03639862 0.05628307]\n", + "cv errors: [0.04375 0.06875 0.075 0.0875 0.10625 0.06875 0.0625 0.0625 0.0625\n", + " 0.05 0.05 ]\n", "Fitted an ensemble with size 5\n", - "cv errors: [0.03532407 0.04989418 0.05244009 0.05244009 0.06401416 0.03532407\n", - " 0.05244009 0.06401416 0.04917639 0.09327692 0.04845199 0.04917639\n", - " 0.03532407 0.03639862 0.04410714 0.04845199 0.04917639 0.03532407\n", - " 0.05244009 0.05244009 0.06401416 0.10994281 0.10026766 0.10026766\n", - " 0.10026766 0.03639862 0.05628307]\n", - "Fitted an ensemble with size 5\n" + "cv errors: [0.04375 0.06875 0.075 0.0875 0.10625 0.06875 0.0625 0.0625 0.0625\n", + " 0.05 0.05 ]\n", + "Fitted an ensemble with size 5\n", + "new approximate rank for the error tensor: (20, 4, 2, 2, 8, 19)\n" ] } ], "source": [ "# fit autolearner on training set and record runtime\n", "start = time.time()\n", - "m.fit(x_train, y_train, categorical) # TensorOboe accepts the list of feature types\n", + "m.fit(x_train, y_train, categorical=None) # TensorOboe accepts the list of feature types\n", "elapsed_time = time.time() - start" ] }, @@ -186,15 +176,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "prediction error: 0.08421052631578946\n", - "elapsed time: 21.445039749145508\n" + "prediction error: 0.025000000000000022\n", + "elapsed time: 17.90514302253723\n" ] } ], "source": [ "# use the fitted autolearner for prediction on test set\n", "y_predicted = m.predict(x_test)\n", - "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification'))) \n", + "print(\"prediction error: {}\".format(error(y_test, y_predicted, 'classification'))) \n", "print(\"elapsed time: {}\".format(elapsed_time))" ] }, @@ -208,36 +198,49 @@ "text/plain": [ "{'ensemble method': 'select at most 5 pipelines with smallest cv error',\n", " 'base learners': [{'imputer': {'algorithm': 'SimpleImputer',\n", - " 'hyperparameters': {'strategy': 'most_frequent'}},\n", - " 'encoder': {'algorithm': None},\n", - " 'standardizer': {'algorithm': None},\n", - " 'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 1}},\n", - " 'estimator': {'algorithm': 'lSVM', 'hyperparameters': {'C': 2}}},\n", + " 'hyperparameters': {'strategy': 'median'}},\n", + " 'encoder': {'algorithm': 'OneHotEncoder',\n", + " 'hyperparameters': {'handle_unknown': 'ignore', 'sparse': 0}},\n", + " 'standardizer': 
{'algorithm': 'StandardScaler', 'hyperparameters': {}},\n", + " 'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 3}},\n", + " 'estimator': {'algorithm': 'ExtraTrees',\n", + " 'hyperparameters': {'min_samples_split': 1e-05, 'criterion': 'entropy'}}},\n", " {'imputer': {'algorithm': 'SimpleImputer',\n", " 'hyperparameters': {'strategy': 'most_frequent'}},\n", " 'encoder': {'algorithm': None},\n", - " 'standardizer': {'algorithm': None},\n", + " 'standardizer': {'algorithm': 'StandardScaler', 'hyperparameters': {}},\n", " 'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 1}},\n", - " 'estimator': {'algorithm': 'lSVM', 'hyperparameters': {'C': 2}}},\n", + " 'estimator': {'algorithm': 'GBT',\n", + " 'hyperparameters': {'learning_rate': 0.1,\n", + " 'max_depth': 3,\n", + " 'max_features': 'log2'}}},\n", " {'imputer': {'algorithm': 'SimpleImputer',\n", - " 'hyperparameters': {'strategy': 'most_frequent'}},\n", + " 'hyperparameters': {'strategy': 'median'}},\n", " 'encoder': {'algorithm': None},\n", - " 'standardizer': {'algorithm': None},\n", + " 'standardizer': {'algorithm': 'StandardScaler', 'hyperparameters': {}},\n", " 'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 1}},\n", - " 'estimator': {'algorithm': 'lSVM', 'hyperparameters': {'C': 2}}},\n", + " 'estimator': {'algorithm': 'GBT',\n", + " 'hyperparameters': {'learning_rate': 0.1,\n", + " 'max_depth': 3,\n", + " 'max_features': 'log2'}}},\n", " {'imputer': {'algorithm': 'SimpleImputer',\n", " 'hyperparameters': {'strategy': 'most_frequent'}},\n", " 'encoder': {'algorithm': None},\n", - " 'standardizer': {'algorithm': None},\n", - " 'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 1}},\n", - " 'estimator': {'algorithm': 'lSVM', 'hyperparameters': {'C': 2}}},\n", + " 'standardizer': {'algorithm': 'StandardScaler', 'hyperparameters': {}},\n", + " 'dim_reducer': {'algorithm': 'VarianceThreshold', 'hyperparameters': {}},\n", + " 'estimator': {'algorithm': 'GBT',\n", + " 'hyperparameters': {'learning_rate': 0.1,\n", + " 'max_depth': 3,\n", + " 'max_features': 'log2'}}},\n", " {'imputer': {'algorithm': 'SimpleImputer',\n", " 'hyperparameters': {'strategy': 'constant'}},\n", " 'encoder': {'algorithm': None},\n", " 'standardizer': {'algorithm': None},\n", - " 'dim_reducer': {'algorithm': 'SelectKBest', 'hyperparameters': {'k': 3}},\n", - " 'estimator': {'algorithm': 'Logit',\n", - " 'hyperparameters': {'C': 3, 'solver': 'liblinear', 'penalty': 'l2'}}}]}" + " 'dim_reducer': {'algorithm': 'VarianceThreshold', 'hyperparameters': {}},\n", + " 'estimator': {'algorithm': 'GBT',\n", + " 'hyperparameters': {'learning_rate': 0.1,\n", + " 'max_depth': 3,\n", + " 'max_features': 'log2'}}}]}" ] }, "execution_count": 6, @@ -288,7 +291,7 @@ "#autolearner arguments\n", "autolearner_kwargs = {\n", " 'p_type': 'classification',\n", - " 'method': 'TensorOboe',\n", + " 'method': method,\n", " 'runtime_limit': RUNTIME_BUDGET,\n", " 'verbose': VERBOSE,\n", " 'selection_method': 'min_variance',\n", @@ -310,40 +313,40 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# fit autolearner on training set and record runtime\n", "start = time.time()\n", - "m.fit(x_train, y_train, categorical)\n", + "m.fit(x_train, y_train, categorical=None)\n", "elapsed_time = time.time() - start" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ - "prediction error: 0.042105263157894736\n", - "elapsed time: 3.519043207168579\n" + "prediction error: 0.025000000000000022\n", + "elapsed time: 7.4095470905303955\n" ] } ], "source": [ "# use the fitted autolearner for prediction on test set\n", "y_predicted = m.predict(x_test)\n", - "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))\n", + "print(\"prediction error: {}\".format(error(y_test, y_predicted, 'classification')))\n", "print(\"elapsed time: {}\".format(elapsed_time))" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "scrolled": true }, @@ -363,7 +366,7 @@ " 'max_features': 'log2'}}}]}" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -376,9 +379,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "test", "language": "python", - "name": "python3" + "name": "test" }, "language_info": { "codemirror_mode": { @@ -390,7 +393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.11" } }, "nbformat": 4, diff --git a/large_files/error_tensor_f16_compressed.npz b/large_files/error_tensor_f16_compressed.npz new file mode 100644 index 0000000..4523439 Binary files /dev/null and b/large_files/error_tensor_f16_compressed.npz differ diff --git a/oboe/defaults/TensorOboe/error_tensor_part_1.npy b/large_files/error_tensor_part_1.npy similarity index 100% rename from oboe/defaults/TensorOboe/error_tensor_part_1.npy rename to large_files/error_tensor_part_1.npy diff --git a/oboe/defaults/TensorOboe/error_tensor_part_2.npy b/large_files/error_tensor_part_2.npy similarity index 100% rename from oboe/defaults/TensorOboe/error_tensor_part_2.npy rename to large_files/error_tensor_part_2.npy diff --git a/oboe/__init__.py b/oboe/__init__.py index 32d3a1c..3bb1b4c 100644 --- a/oboe/__init__.py +++ b/oboe/__init__.py @@ -1,2 +1,7 @@ from oboe.auto_learner import AutoLearner -from oboe.util import error \ No newline at end of file +from oboe.util import error + +# add the path of oboe files to sys.path, so as to load the pickle file of runtime predictors +import sys +from pathlib import Path +sys.path.append(str(Path(__file__).parent.absolute())) diff --git a/oboe/auto_learner.py b/oboe/auto_learner.py index 81fc098..42d901a 100644 --- a/oboe/auto_learner.py +++ b/oboe/auto_learner.py @@ -22,7 +22,7 @@ from .ensemble import Ensemble, Model_collection from . import experiment_design as ED - + class AutoLearner: """An object that automatically selects pipelines by greedy D-optimal design. @@ -37,6 +37,7 @@ class AutoLearner: fit_ensemble_despite_timeout (bool): Whether to still fit the ensemble despite a fitting timeout. Advanced attributes: + mode (str): The running mode of TensorOboe. new_row (np.ndarray): Predicted row of matricized error tensor, corresponding to the new dataset. Default None. dataset_ratio_threshold(float):The threshold of dataset ratio (number of points / number of features) for dataset subsampling, if the training set is tall and skinny (number of data points much larger than number of features). 
runtime_predictor_algorithm (str): @@ -55,19 +56,65 @@ """ def __init__(self, - p_type='classification', method='Oboe', algorithms=None, hyperparameters=None, verbose=False, - n_cores=1, n_folds=5, runtime_limit=512, dataset_ratio_threshold=100, - new_row=None, load_defaults=True, customized_defaults_path='', - load_imputed_error_tensor=True, path_to_imputed_error_tensor='default', save_imputed_error_tensor=True, + p_type='classification', + method='Oboe', + mode='warm', + algorithms=None, + hyperparameters=None, + verbose=False, + n_cores=1, + n_folds=5, + runtime_limit=512, + dataset_ratio_threshold=100, + new_row=None, + load_defaults=True, + customized_defaults_path='', + load_imputed_error_tensor=True, + original_error_tensor_dir=None, + path_to_imputed_error_tensor='default', + save_imputed_error_tensor=True, selection_method='ED', scalarization='D', - build_ensemble=True, load_saved_latent_factors=True, save_latent_factors=True, - ensemble_method='best_several', ensemble_max_size=5, runtime_predictor_algorithm='LinearRegression', - load_saved_runtime_predictors=True, save_fitted_runtime_predictors=False, random_state=0, - fit_ensemble_despite_timeout=True, **stacking_hyperparams): + build_ensemble=True, + load_saved_latent_factors=True, + save_latent_factors=True, + ensemble_method='best_several', + ensemble_max_size=5, + runtime_predictor_algorithm='LinearRegression', + load_saved_runtime_predictors=True, + save_fitted_runtime_predictors=False, + random_state=0, + fit_ensemble_despite_timeout=True, + **stacking_hyperparams): method = method.lower() assert method in {'oboe', 'tensoroboe'}, "The method must be one of {'Oboe', 'TensorOboe'}." self.method = method + + if method == 'tensoroboe': + if mode == 'initialize': # impute error tensor, compute low rank factors, fit runtime predictors + load_imputed_error_tensor = False + save_imputed_error_tensor = True + load_saved_latent_factors = False + load_saved_runtime_predictors = False + save_fitted_runtime_predictors = True + elif mode == 'impute': # impute error tensor, compute low rank factors, use fitted runtime predictors + load_imputed_error_tensor = False + save_imputed_error_tensor = True + load_saved_latent_factors = False + load_saved_runtime_predictors = True + save_fitted_runtime_predictors = False + elif mode == 'factorize': # use imputed error tensor, compute low rank factors, use fitted runtime predictors + load_imputed_error_tensor = True + save_imputed_error_tensor = False + load_saved_latent_factors = False + load_saved_runtime_predictors = True + save_fitted_runtime_predictors = False + elif mode == 'warm': # use imputed error tensor, use low rank factors, use fitted runtime predictors + load_imputed_error_tensor = True + save_imputed_error_tensor = False + load_saved_latent_factors = True + load_saved_runtime_predictors = True + save_fitted_runtime_predictors = False self.verbose = verbose self.random_state = random_state @@ -146,14 +193,18 @@ def __init__(self, if self.verbose: print("created a tmp directory in DEFAULTS ...") - # ERROR_TENSOR = np.load(os.path.join(DEFAULTS, 'error_tensor.npy')) - # RUNTIME_TENSOR = np.load(os.path.join(DEFAULTS, 'runtime_tensor.npy')) - ERROR_TENSOR = np.vstack((np.load(os.path.join(DEFAULTS, 'error_tensor_part_1.npy')), - np.load(os.path.join(DEFAULTS, 'error_tensor_part_2.npy')))) - if verbose: - print("shape of the error tensor: {}".format(ERROR_TENSOR.shape)) - RUNTIME_TENSOR = np.vstack((np.load(os.path.join(DEFAULTS, 'runtime_tensor_part_1.npy')), - np.load(os.path.join(DEFAULTS, 
'runtime_tensor_part_2.npy')))) + if not load_imputed_error_tensor: + assert original_error_tensor_dir is not None + if self.verbose: + print("loading Float16 non-imputed error tensor ...") + # print("loading original Float64 error tensor ...") + + ERROR_TENSOR = np.float64(np.load(os.path.join(original_error_tensor_dir, 'error_tensor_f16_compressed.npz'))['a']) + # ERROR_TENSOR = np.vstack((np.load(os.path.join(original_error_tensor_dir, 'error_tensor_part_1.npy')), + # np.load(os.path.join(original_error_tensor_dir, 'error_tensor_part_2.npy')))) + + RUNTIME_TENSOR = np.float64(np.load(os.path.join(DEFAULTS, 'runtime_tensor_f16_compressed.npz'))['a']) + with open(os.path.join(DEFAULTS, 'training_index.pkl'), 'rb') as handle: TRAINING_INDEX = pickle.load(handle) @@ -172,21 +223,30 @@ def __init__(self, self.n_folds = n_folds # error tensor completion - ranks_for_imputation = (20, 4, 2, 2, 8, 20) + rank_for_imputation = (20, 4, 2, 2, 8, 20) + if verbose: + print("rank for EM-Tucker imputation: {}".format(rank_for_imputation)) + rank_tuple = '-'.join([str(item) for item in rank_for_imputation]) if load_imputed_error_tensor: try: if path_to_imputed_error_tensor == 'default': - error_tensor_imputed = np.load(os.path.join(DEFAULTS, 'tmp', 'error_tensor_imputed.npy')) + error_tensor_imputed = np.float64(np.load(os.path.join(DEFAULTS, 'error_tensor_imputed_20-4-2-2-8-20_f16_compressed.npz'))['a']) else: if self.verbose: print("loading customized tensor at {} ...".format(path_to_imputed_error_tensor)) error_tensor_imputed = np.load(path_to_imputed_error_tensor) except: - print("no files!") + print("Error loading imputed error tensor!") else: - _, _, error_tensor_imputed, _ = tucker_on_error_tensor(ERROR_TENSOR, ranks_for_imputation, save_results=False, verbose=self.verbose) + _, _, error_tensor_imputed, _ = tucker_on_error_tensor(ERROR_TENSOR, rank_for_imputation, save_results=False, verbose=self.verbose) if save_imputed_error_tensor: - np.save(os.path.join(DEFAULTS, 'tmp', 'error_tensor_imputed.npy'), error_tensor_imputed) + imputed_error_tensor_save_path = os.path.join(DEFAULTS, 'tmp', 'error_tensor_imputed_{}.npy'.format(rank_tuple)) + np.save(imputed_error_tensor_save_path, error_tensor_imputed) + if self.verbose: + print("saved imputed error tensor to {}".format(imputed_error_tensor_save_path)) + + if verbose: + print("shape of the error tensor: {}".format(error_tensor_imputed.shape)) self.error_tensor_imputed = error_tensor_imputed @@ -204,6 +264,7 @@ def __init__(self, Vt_t = np.load(os.path.join(DEFAULTS, 'tmp', 'error_tensor_Vt_t.npy')) factorize_error_tensor = False except: + factorize_error_tensor = True if self.verbose: print("No saved latent factors. 
Factorizing the error tensor now ...") else: @@ -212,15 +273,15 @@ def __init__(self, if factorize_error_tensor: if self.verbose: print("Factorizing the error matrix to get latent factors ...") - core_tr, factors_tr = tl.decomposition.tucker(error_tensor_imputed, ranks=(k_dataset_for_factorization, 4, 2, 2, 8, k_estimator_for_factorization)) + core_tr, factors_tr = tl.decomposition.tucker(error_tensor_imputed, rank=(k_dataset_for_factorization, 4, 2, 2, 8, k_estimator_for_factorization)) pipeline_latent_factors = tl.unfold(tl.tenalg.multi_mode_dot(core_tr, factors_tr[1:], modes=[1, 2, 3, 4, 5]), mode=0) U_t, S_t, Vt_t = sp.linalg.svd(pipeline_latent_factors, full_matrices=False) if save_latent_factors: - if self.verbose: - print("Saving latent factors ...") np.save(os.path.join(DEFAULTS, 'tmp', 'error_tensor_U_t.npy'), U_t) np.save(os.path.join(DEFAULTS, 'tmp', 'error_tensor_S_t.npy'), S_t) np.save(os.path.join(DEFAULTS, 'tmp', 'error_tensor_Vt_t.npy'), Vt_t) + if self.verbose: + print("latent factors saved to {}".format(os.path.join(DEFAULTS, 'tmp'))) else: if self.verbose: print("Loading latent factors from storage ...") @@ -322,7 +383,7 @@ def _fit(self, x_train, y_train, categorical, t_predicted, ranks=None, runtime_l return start = time.time() - if self.selection_method is not 'random': + if self.selection_method != 'random': # we only need to fit models on the new dataset if it has not been fitted already to_sample = list(set(to_sample) - self.sampled_indices) if self.verbose: @@ -463,7 +524,7 @@ def _fit(self, x_train, y_train, categorical, t_predicted, ranks=None, runtime_l return start = time.time() - if self.selection_method is not 'random': + if self.selection_method != 'random': candidate_indices = [] # we only need to fit models on the new dataset if it has not been fitted already to_sample = list(set(to_sample) - self.sampled_indices) @@ -506,7 +567,7 @@ def _fit(self, x_train, y_train, categorical, t_predicted, ranks=None, runtime_l print(self.sampled_pipelines[idx]) self.ensemble.candidate_learners.append(self.sampled_pipelines[idx]) - # impute ALL entries + # currently disabled: impute ALL entries # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices)) # self.new_row[:, unknown] = imputed[:, unknown] @@ -523,9 +584,7 @@ def _fit(self, x_train, y_train, categorical, t_predicted, ranks=None, runtime_l # self.ensemble.candidate_learners.append(self.sampled_pipelines[best_sampled_idx]) for i in np.argsort(self.new_row_pred[0]): - if (first and len(candidate_indices) <= 3) or t_predicted[i] + t_predicted[candidate_indices].sum() <= remaining / 4: - # if self.verbose: - # print("Adding models predicted to be the best to the ensemble ...") + if (first and len(candidate_indices) <= 3) or t_predicted[i] + t_predicted[candidate_indices].sum() <= remaining / 4: candidate_indices.append(i) # if model has already been k-fold fitted, immediately add to candidate learners if i in self.sampled_indices: @@ -598,12 +657,10 @@ def _fit(self, x_train, y_train, categorical, t_predicted, ranks=None, runtime_l print("Insufficient time in this round.") - def fit(self, x_train, y_train, categorical=None, verbose=False): + def fit(self, x_train, y_train, categorical=None): """Fit an AutoLearner object, iteratively doubling allowed runtime, and terminate when reaching the time limit.""" - self.verbose = verbose - if self.method == 'oboe': num_points, num_features = x_train.shape @@ -650,7 +707,7 @@ def doubling(): k, t = ranks[0], times[0] counter, self.best = 0, 0 while 
time.time() - start < self.runtime_limit - t: - if verbose: + if self.verbose: print('Fitting with k={}, t={}'.format(k, t)) # if self.build_ensemble: # self.ensemble = Ensemble(self.p_type, self.ensemble_method, self.stacking_hyperparams) @@ -769,7 +826,7 @@ def p2f(x): t_init = max(2**np.floor(np.log2(self.runtime_limit/8)), t_init) if self.verbose: - print("Runtime limit of initial round: {}".format(t_init)) + print("runtime limit of initial round: {} seconds".format(t_init)) times = [t_init] losses = [0.5] @@ -792,7 +849,7 @@ def doubling(): k, t = ranks[0], times[0] counter, self.best = 0, 0 while time.time() - start < self.runtime_limit - t: - if verbose: + if self.verbose: print('Fitting with ranks={}, t={}'.format(k, t)) # if self.build_ensemble: # self.ensemble = Ensemble(self.p_type, self.ensemble_method, self.stacking_hyperparams) @@ -806,7 +863,6 @@ def doubling(): loss = self.ensemble.kfold_fit_validate(x_va, y_va, categorical, n_folds=self.n_folds, timeout=(self.runtime_limit-time.time()+start)/2)[0] # TEMPORARY: Record intermediate results - e_hat.append(np.copy(self.new_row)) e_hat_pred.append(np.copy(self.new_row_pred)) actual_times.append(time.time() - start) @@ -896,11 +952,16 @@ def predict(self, x_test): else: # just select a collection of promising models return self.ensemble.predict(x_test) # the self.ensemble object here is a Model_collection + def _predict_runtime(self, x_train): + # predict runtime for the training set of the new dataset. + if self.verbose: + print("Predicting pipeline running time ..") + return convex_opt.predict_runtime(x_train.shape, saved_model='Class', model=self.runtime_predictor) + # The code below is deprecated def refit(self, x_train, y_train): """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal. - Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. @@ -925,12 +986,6 @@ def get_model_accuracies(self, y_test): """ return self.get_pipeline_accuracies(y_test) - def _predict_runtime(self, x_train): - # predict runtime for the training set of the new dataset. 
- if self.verbose: - print("Predicting pipeline running time ..") - return convex_opt.predict_runtime(x_train.shape, saved_model='Class', model=self.runtime_predictor) - def _greedy_initial_selection(self, x_train, y_train, t_predicted, runtime_limit): if self.verbose: print("Fitting fast pipelines that perform well on average.") @@ -970,5 +1025,4 @@ def _greedy_initial_selection(self, x_train, y_train, t_predicted, runtime_limit self.ensemble.fit(x_train, y_train) else: if self.verbose: - print("Insufficient time to fit fast and on average best-performing pipelines.") - + print("Insufficient time to fit fast and on average best-performing pipelines.") \ No newline at end of file diff --git a/oboe/convex_opt.py b/oboe/convex_opt.py index 2a694f1..17c7cfa 100755 --- a/oboe/convex_opt.py +++ b/oboe/convex_opt.py @@ -35,7 +35,7 @@ def initialize_runtime_predictor(runtime_matrix, runtimes_index, model_name='Lin sizes_index = [] sizes = [] if runtime_matrix is None: - runtime_tensor = pd.read_csv(os.path.join(defaults_path, 'runtime_tensor.csv'), index_col=0) + runtime_tensor = np.float64(np.load(os.path.join(defaults_path, 'runtime_tensor_f16_compressed.npz'))['a']) runtime_matrix = tl.unfold(runtime_tensor, mode=0) if runtimes_index is None: @@ -126,8 +126,7 @@ def fit(self, sizes, sizes_index, runtimes, runtimes_index): for i in range(self.n_models): runtime = runtimes[:, i] no_nan_indices = np.where(np.invert(np.isnan(runtime)))[0] - runtime_no_nan = runtime[no_nan_indices] - + runtime_no_nan = runtime[no_nan_indices] if self.model_name == 'LinearRegression': sizes_train_poly_no_nan = sizes_train_poly[no_nan_indices] @@ -143,8 +142,6 @@ def weights(distances): neigh = KNeighborsRegressor(n_neighbors=5, metric=metric, weights=weights) self.models[i] = neigh.fit(sizes_train_no_nan, runtime_no_nan) -# print(self.models[i].coef_) -# print(self.models[i].intercept_) # self.models[i] = Lasso().fit(sizes_train_poly, runtime) def predict(self, size): diff --git a/oboe/defaults/TensorOboe/error_tensor_imputed_20-4-2-2-8-20_f16_compressed.npz b/oboe/defaults/TensorOboe/error_tensor_imputed_20-4-2-2-8-20_f16_compressed.npz new file mode 100644 index 0000000..6bb97cd Binary files /dev/null and b/oboe/defaults/TensorOboe/error_tensor_imputed_20-4-2-2-8-20_f16_compressed.npz differ diff --git a/oboe/defaults/TensorOboe/runtime_tensor_f16_compressed.npz b/oboe/defaults/TensorOboe/runtime_tensor_f16_compressed.npz new file mode 100644 index 0000000..e4cc3ef Binary files /dev/null and b/oboe/defaults/TensorOboe/runtime_tensor_f16_compressed.npz differ diff --git a/oboe/defaults/TensorOboe/runtime_tensor_part_1.npy b/oboe/defaults/TensorOboe/runtime_tensor_part_1.npy deleted file mode 100644 index b8fb73b..0000000 Binary files a/oboe/defaults/TensorOboe/runtime_tensor_part_1.npy and /dev/null differ diff --git a/oboe/defaults/TensorOboe/runtime_tensor_part_2.npy b/oboe/defaults/TensorOboe/runtime_tensor_part_2.npy deleted file mode 100644 index fd626ec..0000000 Binary files a/oboe/defaults/TensorOboe/runtime_tensor_part_2.npy and /dev/null differ diff --git a/oboe/defaults/TensorOboe/tmp/error_tensor_S_t.npy b/oboe/defaults/TensorOboe/tmp/error_tensor_S_t.npy index c2475f7..a7572f2 100644 Binary files a/oboe/defaults/TensorOboe/tmp/error_tensor_S_t.npy and b/oboe/defaults/TensorOboe/tmp/error_tensor_S_t.npy differ diff --git a/oboe/defaults/TensorOboe/tmp/error_tensor_U_t.npy b/oboe/defaults/TensorOboe/tmp/error_tensor_U_t.npy index cbcb871..f9d5c9a 100644 Binary files 
a/oboe/defaults/TensorOboe/tmp/error_tensor_U_t.npy and b/oboe/defaults/TensorOboe/tmp/error_tensor_U_t.npy differ diff --git a/oboe/defaults/TensorOboe/tmp/error_tensor_Vt_t.npy b/oboe/defaults/TensorOboe/tmp/error_tensor_Vt_t.npy index 8b191eb..4b661b2 100644 Binary files a/oboe/defaults/TensorOboe/tmp/error_tensor_Vt_t.npy and b/oboe/defaults/TensorOboe/tmp/error_tensor_Vt_t.npy differ diff --git a/oboe/defaults/TensorOboe/tmp/error_tensor_imputed.npy b/oboe/defaults/TensorOboe/tmp/error_tensor_imputed.npy deleted file mode 100644 index d8c79a1..0000000 Binary files a/oboe/defaults/TensorOboe/tmp/error_tensor_imputed.npy and /dev/null differ diff --git a/oboe/util.py b/oboe/util.py index 8a73b86..64d0b53 100644 --- a/oboe/util.py +++ b/oboe/util.py @@ -61,8 +61,7 @@ def get_omega(tensor): Ω[index] = 0 return Ω -def tucker_on_error_tensor(error_tensor, ranks=[15, 4, 2, 2, 8, 15], save_results=False, save_path='', verbose=False): - +def tucker_on_error_tensor(error_tensor, rank=[15, 4, 2, 2, 8, 15], save_results=False, save_path='', verbose=False): tensor_pred = np.nan_to_num(error_tensor) tensor_from_fac = np.zeros(error_tensor.shape) errors = [] @@ -72,18 +71,18 @@ def tucker_on_error_tensor(error_tensor, ranks=[15, 4, 2, 2, 8, 15], save_result # while(not stopping_condition(tensor, tensor_from_fac, threshold)): while((len(errors) <= 2 or (errors[-2] - errors[-1])/errors[-2] >= 0.0001) and num_iterations <= 1000): num_iterations += 1 - core, factors = tucker(tensor_pred, ranks=ranks) + core, factors = tucker(tensor_pred, rank=rank) tensor_from_fac = tucker_to_tensor((core, factors)) error = np.linalg.norm(np.multiply(Ω, np.nan_to_num(error_tensor - tensor_from_fac))) if verbose: if not num_iterations % 5: - print("ranks: {}, iteration {}, error: {}".format(ranks, num_iterations, error)) + print("rank: {}, iteration {}, error: {}".format(rank, num_iterations, error)) errors.append(error) tensor_pred = np.nan_to_num(error_tensor) + np.multiply(1-Ω, tensor_from_fac) - core, factors = tucker(tensor_pred, ranks=ranks) + core, factors = tucker(tensor_pred, rank=rank) if save_results: np.save(os.path.join(save_path, 'error_tensor_imputed.npy'), tensor_pred) diff --git a/setup.py b/setup.py index 72af5a2..a6f6c01 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ import setuptools +import os with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() @@ -20,7 +21,7 @@ def package_files(directory): "scipy>=1.4.1", "pandas>=0.24.2", "scikit-learn>=0.22.1", - "tensorly>=0.4.4", + "tensorly==0.6.0", "OpenML>=0.9.0", "mkl>=1.0.0", ], @@ -28,7 +29,7 @@ def package_files(directory): setuptools.setup( name="oboe", - version="0.0.4", + version="0.2.0", author="Chengrun Yang, Yuji Akimoto, Dae Won Kim, Madeleine Udell", author_email="cy438@cornell.edu", description="An AutoML pipeline selection system to quickly select a promising pipeline for a new dataset.", @@ -44,8 +45,8 @@ def package_files(directory): "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ], - packages=setuptools.find_packages(), + packages=setuptools.find_packages(exclude=['large_files']), package_data={'': package_files('oboe/defaults')}, install_requires=install_requires, python_requires=">=3.7", -) \ No newline at end of file +)
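For context on the `ranks` to `rank` edits above: tensorly renamed the Tucker decomposition keyword, which is why `setup.py` now pins `tensorly==0.6.0`. A minimal sketch (not from this repository, assuming tensorly 0.6.0 is installed) of the renamed call:

```python
# Demonstrates the tucker() keyword that oboe/util.py now uses:
# tensorly 0.6.0 expects `rank=`, while the old 0.4.x API used `ranks=`.
import numpy as np
import tensorly as tl
from tensorly.decomposition import tucker

tensor = tl.tensor(np.random.rand(10, 4, 2, 2, 8, 12))
core, factors = tucker(tensor, rank=(5, 4, 2, 2, 8, 6))  # `ranks=` would raise a TypeError here
reconstruction = tl.tucker_to_tensor((core, factors))     # same call as in tucker_on_error_tensor
print(reconstruction.shape)  # (10, 4, 2, 2, 8, 12)
```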