Importamos las librerías básicas.
# Data-handling and plotting libraries used by the notebook.
import pandas as pd
import missingno as msn
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style to the plots created from here on.
sns.set_style("whitegrid")
import matplotlib
# Default matplotlib figure size (width, height), used when a plot does not
# pass its own figsize.
matplotlib.rcParams['figure.figsize'] = [10, 6]
Cargamos el archivo y vemos qué contiene.
# Load the dataset from the CSV file in the working directory and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer='breast-cancer.csv')
data.head(n=5)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
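Eliminamos la columna 'Unnamed: 32', que aparece llena de valores faltantes (NaN), y la columna 'id', que es solo un identificador.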
# Drop the 'Unnamed: 32' column (NaN in every previewed row) and the 'id'
# column. The columns are named through the `columns=` keyword: passing the
# axis positionally (`drop(labels, 1)`) was deprecated in pandas 1.3 and
# raises TypeError from pandas 2.0 on.
data = data.drop(columns=['Unnamed: 32', 'id'])
data.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 31 columns
Si recordamos nuestro análisis de atributos, pudimos ver algunas distribuciones con posibles outliers.
# Per-column mean of the numeric features. `numeric_only=True` is required
# because `data` still contains the string column 'diagnosis'; without it,
# pandas 2.0+ raises TypeError instead of silently skipping that column.
data.mean(numeric_only=True)
radius_mean 14.127292
texture_mean 19.289649
perimeter_mean 91.969033
area_mean 654.889104
smoothness_mean 0.096360
compactness_mean 0.104341
concavity_mean 0.088799
concave points_mean 0.048919
symmetry_mean 0.181162
fractal_dimension_mean 0.062798
radius_se 0.405172
texture_se 1.216853
perimeter_se 2.866059
area_se 40.337079
smoothness_se 0.007041
compactness_se 0.025478
concavity_se 0.031894
concave points_se 0.011796
symmetry_se 0.020542
fractal_dimension_se 0.003795
radius_worst 16.269190
texture_worst 25.677223
perimeter_worst 107.261213
area_worst 880.583128
smoothness_worst 0.132369
compactness_worst 0.254265
concavity_worst 0.272188
concave points_worst 0.114606
symmetry_worst 0.290076
fractal_dimension_worst 0.083946
dtype: float64
# One box-plot figure per group of columns (selected by position), so the
# spread and possible outliers of each group can be inspected on its own
# axes. Each entry pairs a positional column slice with the extra keyword
# arguments passed to seaborn for that figure.
husl = {'palette': 'husl'}
column_groups = [
    (slice(None, 3), husl),
    (slice(3, 4), husl),
    (slice(4, 5), husl),
    (slice(5, 11), husl),
    (slice(11, 13), husl),
    (slice(13, 14), husl),
    (slice(14, 15), husl),
    (slice(15, 17), husl),
    (slice(17, 20), husl),
    (slice(20, 21), {}),  # this figure uses seaborn's default colours
    (slice(21, 24), husl),
    (slice(24, 25), husl),
    (slice(25, None), husl),
]
for column_slice, style_kwargs in column_groups:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(ax=ax, data=data.iloc[:, column_slice], **style_kwargs)
Podemos ver outliers en casi todos los atributos. Vamos a quedarnos con los datos que estén como máximo a tres desviaciones estándar de la media.
# Remove outlier rows: keep only rows whose value lies within three standard
# deviations of the column mean, for every column after the first one
# (the first column is skipped).
#
# The filter is applied sequentially: each column's mean and standard
# deviation are computed on the rows that survived the previous columns'
# filters, so the result depends on column order.
#
# pandas' own `.abs()` is used so the cell does not depend on numpy, which
# this file never imports.
l = len(data)  # row count before filtering, used to report rows removed
for column in data.columns[1:]:
    deviation = (data[column] - data[column].mean()).abs()
    data = data[deviation <= 3 * data[column].std()]
print('Filas eliminadas: {0}'.format(l - len(data)))
Filas eliminadas: 142