Skip to content

Commit

Permalink
Merge pull request #228 from rhiever/development
Browse files Browse the repository at this point in the history
Pull over 0.5 release
  • Loading branch information
rhiever authored Aug 20, 2016
2 parents 427b2fc + 7e142a2 commit 6f63e83
Show file tree
Hide file tree
Showing 86 changed files with 3,176 additions and 3,512 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,5 @@ docs/sources/examples/.Rhistory

# PyCharm
.idea

analyze-oj2-tpot-mdr.ipynb
21 changes: 12 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,24 @@ tpot.export('tpot_mnist_pipeline.py')
Running this code should discover a pipeline that achieves ~98% testing accuracy, and the corresponding Python code should be exported to the `tpot_mnist_pipeline.py` file and look similar to the following:

```python
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
features = tpot_data.view((np.float64, len(tpot_data.dtype.names)))
features = np.delete(features, tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = train_test_split(features, tpot_data['class'], random_state=42)

result1 = tpot_data.copy()
exported_pipeline = make_pipeline(
KNeighborsClassifier(n_neighbors=3, weights="uniform")
)

# Perform classification with a random forest classifier
rfc1 = RandomForestClassifier(n_estimators=200, max_features=min(64, len(result1.columns) - 1))
rfc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result1['rfc1-classification'] = rfc1.predict(result1.drop('class', axis=1).values)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
```

## Contributing to TPOT
Expand Down
6 changes: 2 additions & 4 deletions ci/.travis_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,12 @@ conda update --yes conda
# provided versions
if [[ "$LATEST" == "true" ]]; then
conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
numpy scipy scikit-learn cython pandas
numpy scipy scikit-learn cython
else
conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \
scikit-learn=$SKLEARN_VERSION \
pandas=$PANDAS_VERSION \
cython
cython
fi

source activate testenv
Expand All @@ -62,7 +61,6 @@ python --version
python -c "import numpy; print('numpy %s' % numpy.__version__)"
python -c "import scipy; print('scipy %s' % scipy.__version__)"
python -c "import sklearn; print('sklearn %s' % sklearn.__version__)"
python -c "import pandas; print('pandas %s' % pandas.__version__)"
python -c "import deap; print('deap %s' % deap.__version__)"
python -c "import update_checker; print('update_checker %s' % update_checker.__version__)"
python -c "import tqdm; print('tqdm %s' % tqdm.__version__)"
Expand Down
1 change: 0 additions & 1 deletion ci/.travis_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ python --version
python -c "import numpy; print('numpy %s' % numpy.__version__)"
python -c "import scipy; print('scipy %s' % scipy.__version__)"
python -c "import sklearn; print('sklearn %s' % sklearn.__version__)"
python -c "import pandas; print('pandas %s' % pandas.__version__)"
python -c "import deap; print('deap %s' % deap.__version__)"
python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)"
python -c "import tqdm; print('tqdm %s' % tqdm.__version__)"
Expand Down
1 change: 1 addition & 0 deletions docs/mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ pages:
- Feature Selection:
- VarianceThreshold: documentation/pipeline_operators/feature_selection/VarianceThreshold.md
- SelectKBest: documentation/pipeline_operators/feature_selection/SelectKBest.md
- SelectFwe: documentation/pipeline_operators/feature_selection/SelectFwe.md
- SelectPercentile: documentation/pipeline_operators/feature_selection/SelectPercentile.md
- RFE: documentation/pipeline_operators/feature_selection/RFE.md
- Decomposition:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@ Uses Scikit-learn's FeatureAgglomeration to transform the feature set.

Parameters
----------
input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
input_df: numpy.ndarray {n_samples, n_features+['class', 'group', 'guess']}
Input DataFrame to scale
n_clusters: int
The number of clusters to find.
affinity: int
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
"manhattan", "cosine", or "precomputed". If linkage is "ward", only
Expand All @@ -25,43 +23,35 @@ Parameters

Returns
-------
modified_df: pandas.DataFrame {n_samples, n_components + ['guess', 'group', 'class']}
modified_df: numpy.ndarray {n_samples, n_components + ['guess', 'group', 'class']}
Returns a DataFrame containing the transformed features

Example Exported Code
---------------------

```Python
import numpy as np
import pandas as pd

from sklearn.cluster import FeatureAgglomeration
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')

# Use Scikit-learn's FeatureAgglomeration to transform the feature set
training_features = result1.loc[training_indices].drop('class', axis=1)
features = tpot_data.view((np.float64, len(tpot_data.dtype.names)))
features = np.delete(features, tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = train_test_split(features, tpot_data['class'], random_state=42)

if len(training_features.columns.values) > 0:
# FeatureAgglomeration must be fit on only the training data
fa = FeatureAgglomeration(n_clusters=51, affinity='euclidean', linkage='complete')
fa.fit(training_features.values.astype(np.float64))
transformed_features = fa.transform(result1.drop('class', axis=1).values.astype(np.float64))
result1 = pd.DataFrame(data=transformed_features)
result1['class'] = result1['class'].values
else:
result1 = result1.copy()
exported_pipeline = make_pipeline(
FeatureAgglomeration(affinity="euclidean", linkage="ward"),
DecisionTreeClassifier(min_weight_fraction_leaf=0.5)
)

# Perform classification with a decision tree classifier
dtc2 = DecisionTreeClassifier(max_features=min(145, len(result1.columns) - 1), max_depth=2835)
dtc2.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['dtc2-classification'] = dtc2.predict(result2.drop('class', axis=1).values)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

```
Original file line number Diff line number Diff line change
Expand Up @@ -9,51 +9,41 @@ Uses Scikit-learn's FastICA to transform the feature set.

Parameters
----------
input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
input_df: numpy.ndarray {n_samples, n_features+['class', 'group', 'guess']}
Input DataFrame to scale
tol: float
Tolerance on update at each iteration.

Returns
-------
modified_df: pandas.DataFrame {n_samples, n_components + ['guess', 'group', 'class']}
modified_df: numpy.ndarray {n_samples, n_components + ['guess', 'group', 'class']}
Returns a DataFrame containing the transformed features


Example Exported Code
---------------------

```Python
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import FastICA
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)


# Use Scikit-learn's FastICA to transform the feature set
training_features = {INPUT_DF}.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0:
# FastICA must be fit on only the training data
ica = FastICA(tol=0.1, random_state=42)
ica.fit(training_features.values.astype(np.float64))
transformed_features = ica.transform(tpot_data.drop('class', axis=1).values.astype(np.float64))
result1 = pd.DataFrame(data=transformed_features)
result1['class'] = tpot_data['class'].values
else:
result1 = tpot_data.copy()
input_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1), input_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes =\
    train_test_split(features, input_data['class'], random_state=42)

# Perform classification with a decision tree classifier
result2 = result1.copy()

dtc1 = DecisionTreeClassifier(max_features='auto', max_depth=None)
dtc1.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values)
exported_pipeline = make_pipeline(
FastICA(tol=0.96),
DecisionTreeClassifier(min_weight_fraction_leaf=0.5)
)

result2['dtc1-classification'] = dtc1.predict(result2.drop('class', axis=1).values)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

```
Original file line number Diff line number Diff line change
Expand Up @@ -9,53 +9,41 @@ Uses Scikit-learn's RandomizedPCA to transform the feature set.

Parameters
----------
input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
input_df: numpy.ndarray {n_samples, n_features+['class', 'group', 'guess']}
Input DataFrame to scale
iterated_power: int
Number of iterations for the power method. [1, 10]

Returns
-------
modified_df: pandas.DataFrame {n_samples, n_components + ['guess', 'group', 'class']}
modified_df: numpy.ndarray {n_samples, n_components + ['guess', 'group', 'class']}
Returns a DataFrame containing the transformed features


Example Exported Code
---------------------

```Python
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import FastICA
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import RandomizedPCA

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
input_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1), input_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes =\
    train_test_split(features, input_data['class'], random_state=42)


# Use Scikit-learn's RandomizedPCA to transform the feature set
training_features = tpot_data.loc[training_indices].drop('class', axis=1)
exported_pipeline = make_pipeline(
    RandomizedPCA(iterated_power=5),
DecisionTreeClassifier(min_weight_fraction_leaf=0.5)
)

if len(training_features.columns.values) > 0:
# RandomizedPCA must be fit on only the training data
pca = RandomizedPCA(iterated_power=10)
pca.fit(training_features.values.astype(np.float64))
transformed_features = pca.transform(tpot_data.drop('class', axis=1).values.astype(np.float64))

tpot_data_classes = tpot_data['class'].values
result1 = pd.DataFrame(data=transformed_features)
result1['class'] = tpot_data_classes
else:
result1 = tpot_data.copy()

# Perform classification with a decision tree classifier
result2 = result1.copy()

dtc1 = DecisionTreeClassifier(max_features='auto', max_depth=None)
dtc1.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values)

result2['dtc1-classification'] = dtc1.predict(result2.drop('class', axis=1).values)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

```
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Recursive Feature Elimination
* * *
* * *

Uses Scikit-learn's Recursive Feature Elimination to learn the subset of features that have the highest weights according to the estimator.

## Dependencies
## Dependencies
sklearn.feature_selection.RFE
sklearn.svm.SVC


Parameters
----------
input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
input_df: numpy.ndarray {n_samples, n_features+['class', 'group', 'guess']}
Input DataFrame to perform feature selection on
num_features: int
The number of features to select
Expand All @@ -19,45 +19,38 @@ Parameters

Returns
-------
subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
subsetted_df: numpy.ndarray {n_samples, n_filtered_features + ['guess', 'group', 'class']}
Returns a DataFrame containing the `num_features` best features

Example Exported Code
---------------------

```Python
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)



# Use Scikit-learn's Recursive Feature Elimination (RFE) for feature selection
training_features = tpot_data.loc[training_indices].drop('class', axis=1)
training_class_vals = tpot_data.loc[training_indices, 'class'].values

if len(training_features.columns.values) == 0:
result1 = tpot_data.copy()
else:
selector = RFE(SVC(kernel='linear'), n_features_to_select=1, step=0.99)
selector.fit(training_features.values, training_class_vals)
mask = selector.get_support(True)
mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
result1 = tpot_data[mask_cols]
input_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1), input_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = train_test_split(features, input_data['class'], random_state=42)

# Perform classification with a decision tree classifier
result2 = result1.copy()

dtc1 = DecisionTreeClassifier(max_features='auto', max_depth=None)
dtc1.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values)
exported_pipeline = make_pipeline(
RFE(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
max_iter=-1, probability=False, random_state=42, shrinking=True,
tol=0.001, verbose=False), step=0.96),
DecisionTreeClassifier(min_weight_fraction_leaf=0.5)
)

result2['dtc1-classification'] = dtc1.predict(result2.drop('class', axis=1).values)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

```
Loading

0 comments on commit 6f63e83

Please sign in to comment.