Skip to content

Latest commit

 

History

History
569 lines (460 loc) · 10.9 KB

File metadata and controls

569 lines (460 loc) · 10.9 KB

Outliers

Importamos las librerías básicas.

import pandas as pd
import missingno as msn
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib
matplotlib.rcParams['figure.figsize'] = [10, 6]

Cargamos el archivo y vemos qué contiene.

# Read the breast-cancer CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer='breast-cancer.csv')
data.head(5)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns

Eliminamos la columna 'Unnamed: 32', que aparece con valores faltantes (NaN), y la columna 'id', que es solo un identificador.

# Drop the 'Unnamed: 32' column (shown as NaN in the preview) and the 'id'
# column. The columns are named via the `columns=` keyword because passing
# the axis positionally (`drop(labels, 1)`) is rejected by pandas >= 2.0.
data = data.drop(columns=['Unnamed: 32', 'id'])
data.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 31 columns

Si recordamos nuestro análisis de atributos, pudimos ver algunas distribuciones con posibles outliers.

# Mean of every numeric column. `numeric_only=True` is required because the
# frame still contains the string column 'diagnosis'; without it pandas >= 2.0
# raises a TypeError instead of silently skipping non-numeric columns.
data.mean(numeric_only=True)
radius_mean                 14.127292
texture_mean                19.289649
perimeter_mean              91.969033
area_mean                  654.889104
smoothness_mean              0.096360
compactness_mean             0.104341
concavity_mean               0.088799
concave points_mean          0.048919
symmetry_mean                0.181162
fractal_dimension_mean       0.062798
radius_se                    0.405172
texture_se                   1.216853
perimeter_se                 2.866059
area_se                     40.337079
smoothness_se                0.007041
compactness_se               0.025478
concavity_se                 0.031894
concave points_se            0.011796
symmetry_se                  0.020542
fractal_dimension_se         0.003795
radius_worst                16.269190
texture_worst               25.677223
perimeter_worst            107.261213
area_worst                 880.583128
smoothness_worst             0.132369
compactness_worst            0.254265
concavity_worst              0.272188
concave points_worst         0.114606
symmetry_worst               0.290076
fractal_dimension_worst      0.083946
dtype: float64
# Box plots for the columns at positions 0-2.
# NOTE(review): position 0 is the non-numeric 'diagnosis' column according to
# the preview above; confirm how the installed seaborn version treats it.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, :3], palette='husl', ax=ax)

png

# Box plot for the single column at position 3, drawn on its own figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 3:4], palette='husl', ax=ax)

png

# Box plot for the single column at position 4, drawn on its own figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 4:5], palette='husl', ax=ax)

png

# Box plots for the six columns at positions 5-10, sharing one figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 5:11], palette='husl', ax=ax)

png

# Box plots for the two columns at positions 11-12, sharing one figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 11:13], palette='husl', ax=ax)

png

# Box plot for the single column at position 13, drawn on its own figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 13:14], palette='husl', ax=ax)

png

# Box plot for the single column at position 14, drawn on its own figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 14:15], palette='husl', ax=ax)

png

# Box plots for the two columns at positions 15-16, sharing one figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 15:17], palette='husl', ax=ax)

png

# Box plots for the three columns at positions 17-19, sharing one figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 17:20], palette='husl', ax=ax)

png

# Box plot for the single column at position 20. Unlike the other cells, no
# palette is passed here, so seaborn's default colour is used.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 20:21], ax=ax)

png

# Box plots for the three columns at positions 21-23, sharing one figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 21:24], palette='husl', ax=ax)

png

# Box plot for the single column at position 24, drawn on its own figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 24:25], palette='husl', ax=ax)

png

# Box plots for every remaining column, from position 25 to the end.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, 25:], palette='husl', ax=ax)

png

Podemos ver outliers en casi todos los atributos. Vamos a quedarnos con los datos que estén, como máximo, a tres desviaciones estándar de la media.

# Keep only the rows that lie within three standard deviations of the mean
# for every column after the first one (the first column is skipped by
# `data.columns[1:]`), then report how many rows were removed.
#
# The filter is applied column by column on the already-filtered frame, so
# each column's mean and standard deviation are computed from the rows that
# survived the previous columns.
#
# `Series.abs()` is used instead of `np.abs(...)` because numpy is never
# imported in this notebook, which made the original cell raise a NameError.
l = len(data)
for column in data.columns[1:]:
    deviation = (data[column] - data[column].mean()).abs()
    data = data[deviation <= 3 * data[column].std()]
print('Filas eliminadas: {0}'.format(l - len(data)))
Filas eliminadas: 142