# Imports necesarios para graficas
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
# Imports necesarios para arbol de decision
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
# Imports necesarios para clasificacion bayesiana
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
El dataframe no presenta nombres en las columnas; por ello se buscó la fuente del dataset y se agregaron los nombres de las columnas.
# Upload the 'adult.data' CSV from the local machine into Colaboratory and
# read it into a DataFrame.
from google.colab import files
import io

# The file is read with explicit column names because it carries no header
# row of its own.
nombres_columnas = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

uploaded = files.upload()
contenido_csv = io.BytesIO(uploaded['adult.data'])
adult_dataframe_leido = pd.read_csv(contenido_csv, names=nombres_columnas)

# Quick look at the first rows and at the (rows, columns) dimensions.
adult_dataframe_leido.head()
adult_dataframe_leido.shape
Son 32561 datos con 15 atributos
# Number of rows in each income class.
adult_dataframe_leido.groupby('income').size()
# Same counts as a bar chart. catplot(kind="count") is the replacement for
# factorplot, which current seaborn releases no longer provide; the column is
# passed by keyword because positional data arguments are no longer accepted.
sb.catplot(x='income', data=adult_dataframe_leido, kind="count")
Son 7841 personas que tienen ingresos mayores a US$50000 al año.
# Scatter plot of age against weekly working hours, one point per row,
# colored by income class.
# The keys keep the leading space used by the raw 'income' values.
colores = {" <=50K": 'orange',
           " >50K": 'blue'}
tamanios = [60, 40]  # not used by this plot; kept in case later cells read it

f1 = adult_dataframe_leido['age'].values
f2 = adult_dataframe_leido['hours-per-week'].values

# Look the color up straight from the 'income' column instead of walking the
# frame with iterrows(), which builds a full Series per row just to read one
# field. An unknown income label still raises KeyError, as before.
asignar = [colores[ingreso] for ingreso in adult_dataframe_leido['income']]

plt.figure(figsize=(16, 10))
plt.scatter(f1, f2, c=asignar, s=30)
plt.show()
No se visualiza una correlación entre estos datos
# Scatter plot of age against capital gain, colored by income class using the
# `colores` mapping defined for the previous plot.
f1 = adult_dataframe_leido['age'].values
f2 = adult_dataframe_leido['capital-gain'].values

# Look the color up straight from the 'income' column instead of walking the
# frame with iterrows(); an unknown income label still raises KeyError.
asignar = [colores[ingreso] for ingreso in adult_dataframe_leido['income']]

plt.figure(figsize=(16, 10))
plt.scatter(f1, f2, c=asignar, s=30)
plt.show()
No se nota una correlación
# For each column in turn, select the rows whose value in that column is
# missing. Each line is a bare expression, meant to be inspected
# interactively.
adult_dataframe_leido[adult_dataframe_leido['age'].isna()]
adult_dataframe_leido[adult_dataframe_leido['workclass'].isna()]
adult_dataframe_leido[adult_dataframe_leido['fnlwgt'].isna()]
adult_dataframe_leido[adult_dataframe_leido['education'].isna()]
adult_dataframe_leido[adult_dataframe_leido['education-num'].isna()]
adult_dataframe_leido[adult_dataframe_leido['marital-status'].isna()]
adult_dataframe_leido[adult_dataframe_leido['occupation'].isna()]
adult_dataframe_leido[adult_dataframe_leido['relationship'].isna()]
adult_dataframe_leido[adult_dataframe_leido['race'].isna()]
adult_dataframe_leido[adult_dataframe_leido['sex'].isna()]
adult_dataframe_leido[adult_dataframe_leido['capital-gain'].isna()]
adult_dataframe_leido[adult_dataframe_leido['capital-loss'].isna()]
adult_dataframe_leido[adult_dataframe_leido['hours-per-week'].isna()]
adult_dataframe_leido[adult_dataframe_leido['native-country'].isna()]
adult_dataframe_leido[adult_dataframe_leido['income'].isna()]
# Print, for every categorical column, how many rows fall in each category,
# most frequent first, followed by a separator line.
separador = "### ### ###"

# (column, heading) pairs, in the order they are reported.
conteos_por_columna = [
    ('workclass', " Cantidad de personas por clase de trabajo:"),
    ('education', " Cantidad de personas por nivel de educación:"),
    ('marital-status', " Cantidad de personas por estado civil:"),
    ('occupation', " Cantidad de personas por ocupación:"),
    ('relationship', " Cantidad de personas relación:"),
    ('race', " Cantidad de personas por raza:"),
    ('sex', " Cantidad de personas por sexo:"),
    ('native-country', " Cantidad de personas por país de origen:"),
]

for columna, titulo in conteos_por_columna:
    grouped11 = adult_dataframe_leido.groupby(columna).size()
    neworder11 = grouped11.sort_values(ascending=False)
    print(titulo)
    print(neworder11)
    print(separador)
Todos los de gobierno se tomarán como del mismo tipo;
quedaría definido así:
# Integer codes for 'workclass'. The three government classes share code 0,
# and " ?", " Without-pay" and " Never-worked" share code 4.
# The mapping has its own name so the builtin `dict` is not shadowed for the
# rest of the session.
workclass_map = {
    " Private": 1,
    " Self-emp-not-inc": 2,
    " Local-gov": 0,
    " ?": 4,
    " State-gov": 0,
    " Self-emp-inc": 3,
    " Federal-gov": 0,
    " Without-pay": 4,
    " Never-worked": 4,
    " ": 0,
}
# astype(int) raises if any value is missing from the mapping (map() leaves
# NaN there), so an unmapped category fails loudly.
adult_dataframe_leido['workclassEncoded'] = (
    adult_dataframe_leido['workclass'].map(workclass_map).astype(int)
)
Los casados se incluyen todos en el mismo grupo.
# Integer codes for 'marital-status'; the three "Married-*" categories all
# map to code 0.
codigos_estado_civil = {
    ' Married-civ-spouse': 0,
    ' Married-spouse-absent': 0,
    ' Married-AF-spouse': 0,
    ' Never-married': 1,
    ' Divorced': 2,
    ' Separated': 3,
    ' Widowed': 4,
}
estado_civil = adult_dataframe_leido['marital-status']
adult_dataframe_leido['marital-statusEncoded'] = (
    estado_civil.map(codigos_estado_civil).astype(int)
)
Solo se agruparon las fuerzas armadas con los servicios de protección. Los otros valores ya estaban lo suficientemente segmentados.
# Integer codes for 'occupation', listed by code. ' Armed-Forces' shares
# code 11 with ' Protective-serv'; ' ?' gets its own code 13.
codigos_ocupacion = {
    ' Prof-specialty': 0,
    ' Craft-repair': 1,
    ' Exec-managerial': 2,
    ' Adm-clerical': 3,
    ' Sales': 4,
    ' Other-service': 5,
    ' Machine-op-inspct': 6,
    ' Transport-moving': 7,
    ' Handlers-cleaners': 8,
    ' Farming-fishing': 9,
    ' Tech-support': 10,
    ' Protective-serv': 11,
    ' Armed-Forces': 11,
    ' Priv-house-serv': 12,
    ' ?': 13,
}
ocupacion = adult_dataframe_leido['occupation']
adult_dataframe_leido['occupationEncoded'] = (
    ocupacion.map(codigos_ocupacion).astype(int)
)
Ya están lo suficientemente segmentados.
# Integer codes for 'relationship', listed by code. The empty-string key maps
# to 0, the same code as ' Not-in-family'.
codigos_relacion = {
    ' Not-in-family': 0,
    '': 0,
    ' Husband': 1,
    ' Wife': 2,
    ' Own-child': 3,
    ' Unmarried': 4,
    ' Other-relative': 5,
}
relacion = adult_dataframe_leido['relationship']
adult_dataframe_leido['relationshipEncoded'] = (
    relacion.map(codigos_relacion).astype(int)
)
Ya está lo suficientemente segmentado.
# Integer codes for 'race': one code per category, no grouping.
codigos_raza = {
    ' White': 0,
    ' Black': 1,
    ' Asian-Pac-Islander': 2,
    ' Amer-Indian-Eskimo': 3,
    ' Other': 4,
}
adult_dataframe_leido['raceEncoded'] = (
    adult_dataframe_leido['race'].map(codigos_raza).astype(int)
)

# Integer codes for 'sex'.
codigos_sexo = {' Male': 0, ' Female': 1}
adult_dataframe_leido['sexEncoded'] = (
    adult_dataframe_leido['sex'].map(codigos_sexo).astype(int)
)
Se clasificarán por continente, exceptuando a Estados Unidos porque tiene más del 80% de los datos en el dataframe.
# Integer codes for 'native-country', grouped by region. Entries are listed
# region by region so a misplaced country is easy to spot.
# ' Holand-Netherlands' is coded 3 with the other European countries; it was
# previously coded 7, the Central America / Caribbean group.
codigos_pais = {
    # 1: United States and its outlying territories
    ' United-States': 1,
    ' Outlying-US(Guam-USVI-etc)': 1,
    # 2: rest of North America
    ' Mexico': 2,
    ' Canada': 2,
    # 3: Europe
    ' Germany': 3,
    ' England': 3,
    ' Italy': 3,
    ' Poland': 3,
    ' Portugal': 3,
    ' Greece': 3,
    ' France': 3,
    ' Ireland': 3,
    ' Yugoslavia': 3,
    ' Hungary': 3,
    ' Scotland': 3,
    ' Holand-Netherlands': 3,
    # 4: Asia
    ' Philippines': 4,
    ' India': 4,
    ' China': 4,
    ' Vietnam': 4,
    ' Japan': 4,
    ' Taiwan': 4,
    ' Iran': 4,
    ' Hong': 4,
    ' Cambodia': 4,
    ' Laos': 4,
    ' Thailand': 4,
    # 6: South America
    ' Columbia': 6,
    ' Peru': 6,
    ' Ecuador': 6,
    # 7: Central America and the Caribbean
    ' Puerto-Rico': 7,
    ' El-Salvador': 7,
    ' Cuba': 7,
    ' Jamaica': 7,
    ' Dominican-Republic': 7,
    ' Guatemala': 7,
    ' Haiti': 7,
    ' Nicaragua': 7,
    ' Trinadad&Tobago': 7,
    ' Honduras': 7,
    # 8: unknown (' ?') and the ' South' label, which the mapping groups with it
    ' ?': 8,
    ' South': 8,
}
# astype(int) raises if any country is missing from the mapping.
adult_dataframe_leido['native-countryEncoded'] = (
    adult_dataframe_leido['native-country'].map(codigos_pais).astype(int)
)
# Binary target: 0 for ' <=50K', 1 for ' >50K'.
codigos_ingreso = {' <=50K': 0, ' >50K': 1}
adult_dataframe_leido['incomeEncoded'] = (
    adult_dataframe_leido['income'].map(codigos_ingreso).astype(int)
)

# Print the row counts, most frequent first, for the textual education level
# and for its numeric counterpart.
for columna, titulo in (
    ('education', " Cantidad de personas por nivel de educación:"),
    ('education-num', " Cantidad de personas por numero de educación:"),
):
    grouped11 = adult_dataframe_leido.groupby(columna).size()
    neworder11 = grouped11.sort_values(ascending=False)
    print(titulo)
    print(neworder11)
# Bucket 'age' by decade into 'ageEncoded': below 10 -> 0, [10, 20) -> 1,
# ..., [90, 100) -> 9. Ages of 100 or more are not assigned a code.
adult_dataframe_age = adult_dataframe_leido['age']
adult_dataframe_leido.loc[adult_dataframe_age < 10, 'ageEncoded'] = 0
for codigo in range(1, 10):
    inferior = codigo * 10
    superior = inferior + 10
    en_decada = (adult_dataframe_age < superior) & (adult_dataframe_age >= inferior)
    adult_dataframe_leido.loc[en_decada, 'ageEncoded'] = codigo
# Largest capital gain in the data (inspected interactively).
adult_dataframe_leido["capital-gain"].max()

# Bucket 'capital-gain' in steps of 10000 into 'capital-gainEncoded':
# below 10000 -> 0, [10000, 20000) -> 1, [20000, 30000) -> 2,
# 30000 and above -> 3.
adult_dataframe_capital = adult_dataframe_leido['capital-gain']
adult_dataframe_leido.loc[adult_dataframe_capital < 10000, 'capital-gainEncoded'] = 0
for codigo, inferior in ((1, 10000), (2, 20000)):
    en_tramo = (adult_dataframe_capital < inferior + 10000) & (adult_dataframe_capital >= inferior)
    adult_dataframe_leido.loc[en_tramo, 'capital-gainEncoded'] = codigo
adult_dataframe_leido.loc[adult_dataframe_capital >= 30000, 'capital-gainEncoded'] = 3
# Largest capital loss in the data (inspected interactively).
adult_dataframe_leido["capital-loss"].max()

# Bucket 'capital-loss' in steps of 1000 into 'capital-lossEncoded':
# below 1000 -> 0, [1000, 2000) -> 1, 2000 and above -> 2.
adult_dataframe_capital = adult_dataframe_leido['capital-loss']
perdida_baja = adult_dataframe_capital < 1000
perdida_media = (adult_dataframe_capital < 2000) & (adult_dataframe_capital >= 1000)
perdida_alta = adult_dataframe_capital >= 2000
adult_dataframe_leido.loc[perdida_baja, 'capital-lossEncoded'] = 0
adult_dataframe_leido.loc[perdida_media, 'capital-lossEncoded'] = 1
adult_dataframe_leido.loc[perdida_alta, 'capital-lossEncoded'] = 2
# Largest weekly-hours value in the data (inspected interactively).
adult_dataframe_leido["hours-per-week"].max()

# Bucket 'hours-per-week' in steps of 10 into 'hours-per-weekEncoded':
# below 10 -> 0, [10, 20) -> 1, ..., [90, 100) -> 9. Values of 100 or more
# are not assigned a code.
adult_dataframe_hours = adult_dataframe_leido['hours-per-week']
adult_dataframe_leido.loc[adult_dataframe_hours < 10, 'hours-per-weekEncoded'] = 0
for codigo in range(1, 10):
    inferior = codigo * 10
    superior = inferior + 10
    en_tramo = (adult_dataframe_hours < superior) & (adult_dataframe_hours >= inferior)
    adult_dataframe_leido.loc[en_tramo, 'hours-per-weekEncoded'] = codigo
# Largest 'fnlwgt' value in the data (inspected interactively).
adult_dataframe_leido["fnlwgt"].max()

# Bucket 'fnlwgt' in steps of 100000 into 'fnlwgtEncoded':
# below 100000 -> 0, [100000, 200000) -> 1, ..., [500000, 600000) -> 5,
# 600000 and above -> 6.
adult_dataframe_fnl = adult_dataframe_leido['fnlwgt']
adult_dataframe_leido.loc[adult_dataframe_fnl < 100000, 'fnlwgtEncoded'] = 0
for codigo in range(1, 6):
    inferior = codigo * 100000
    superior = inferior + 100000
    en_tramo = (adult_dataframe_fnl < superior) & (adult_dataframe_fnl >= inferior)
    adult_dataframe_leido.loc[en_tramo, 'fnlwgtEncoded'] = codigo
adult_dataframe_leido.loc[adult_dataframe_fnl >= 600000, 'fnlwgtEncoded'] = 6
# Raw columns that now have an "...Encoded" counterpart (plus 'education',
# whose numeric form 'education-num' is kept). Dropping them leaves only
# numeric columns in adult_dataframe_encoded.
drop_elements = [
    'age', 'workclass', 'fnlwgt', 'education', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain',
    'capital-loss', 'hours-per-week', 'native-country', 'income',
]
adult_dataframe_encoded = adult_dataframe_leido.drop(columns=drop_elements)
# Inspect the encoded frame: contents, dimensions and summary statistics.
adult_dataframe_encoded
adult_dataframe_encoded.shape
adult_dataframe_encoded.describe()

# Annotated heatmap of the pairwise correlation between all encoded columns.
colormap = plt.cm.viridis
correlaciones = adult_dataframe_encoded.astype(float).corr()
plt.figure(figsize=(13, 13))
plt.title('Pearson Correlation of Features')
sb.heatmap(
    correlaciones,
    vmax=1.0,
    cmap=colormap,
    annot=True,
    linewidths=0.1,
    linecolor='white',
    square=True,
)
# 5-fold cross-validation of a decision tree for every max_depth from 1 up to
# the number of columns in adult_dataframe_encoded (the count includes the
# target column 'incomeEncoded'). The mean validation accuracy per depth is
# printed as a table.
cv = KFold(n_splits=5)  # number of folds
accuracies = list()
max_attributes = len(list(adult_dataframe_encoded))
depth_range = range(1, max_attributes + 1)

for depth in depth_range:
    fold_accuracy = []
    tree_model = tree.DecisionTreeClassifier(criterion='entropy',
                                             min_samples_split=20,
                                             min_samples_leaf=5,
                                             max_depth=depth,
                                             class_weight={1: 3.15})  # class 1 weighted 3.15
    for train_fold, valid_fold in cv.split(adult_dataframe_encoded):
        # KFold.split yields positional indices, so rows are selected with
        # .iloc; .loc would look the numbers up as index labels and only
        # happens to agree when the index is the default 0..n-1 range.
        f_train = adult_dataframe_encoded.iloc[train_fold]
        f_valid = adult_dataframe_encoded.iloc[valid_fold]

        model = tree_model.fit(X=f_train.drop(['incomeEncoded'], axis=1),
                               y=f_train["incomeEncoded"])
        # accuracy on the held-out fold
        valid_acc = model.score(X=f_valid.drop(['incomeEncoded'], axis=1),
                                y=f_valid["incomeEncoded"])
        fold_accuracy.append(valid_acc)

    avg = sum(fold_accuracy) / len(fold_accuracy)
    accuracies.append(avg)

# Show the results
df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))
# Print how many rows carry each encoded income class, largest class first,
# followed by the separator line.
grouped11 = adult_dataframe_leido.groupby('incomeEncoded').size()
neworder11 = grouped11.sort_values(ascending=False)
for linea in (" Cantidad de personas por incomeEncoded:", neworder11, separador):
    print(linea)
# Training arrays: the target is the encoded income class and the features
# are every other encoded column, passed as a bare array (no column names).
y_train = adult_dataframe_encoded['incomeEncoded']
x_train = adult_dataframe_encoded.drop(['incomeEncoded'], axis=1).values

# Decision tree with max_depth = 4, using the same remaining settings as the
# cross-validation run.
decision_tree = tree.DecisionTreeClassifier(criterion='entropy',
                                            min_samples_split=20,
                                            min_samples_leaf=5,
                                            max_depth=4,
                                            class_weight={1: 3.15})
decision_tree.fit(x_train, y_train)

# Export the fitted tree to a .dot file. The return value of export_graphviz
# is not assigned: with out_file given it returns None, and binding that to
# `f` would overwrite the open file handle inside its own `with` block.
with open(r"tree1.dot", 'w') as f:
    tree.export_graphviz(decision_tree,
                         out_file=f,
                         max_depth=7,
                         impurity=True,
                         feature_names=list(adult_dataframe_encoded.drop(['incomeEncoded'], axis=1)),
                         class_names=[' <=50K', ' >50K'],
                         rounded=True,
                         filled=True)

# Convert the .dot file to PNG so it can be displayed (requires the Graphviz
# `dot` executable on PATH).
check_call(['dot', '-Tpng', r'tree1.dot', '-o', r'tree1.png'])
PImage("tree1.png")

# Accuracy on the same data the tree was fitted on, as a percentage.
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
print(acc_decision_tree, '%')
# Keep a copy of the encoded frame, then restore the working frame from it.
adult_data = adult_dataframe_encoded.copy()
adult_dataframe_encoded = adult_data.copy()

# Split features and target, then keep the 5 best-scoring feature columns.
y = adult_dataframe_encoded['incomeEncoded']
X = adult_dataframe_encoded.drop(columns=['incomeEncoded'])
best = SelectKBest(k=5)
X_new = best.fit_transform(X, y)
selected = best.get_support(indices=True)
used_features = X.columns[selected]
print(used_features)
X_new.shape

# Annotated correlation heatmap restricted to the selected features.
colormap = plt.cm.viridis
correlacion_seleccion = adult_dataframe_encoded[used_features].astype(float).corr()
plt.figure(figsize=(12, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sb.heatmap(
    correlacion_seleccion,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='black',
    annot=True,
)
# Split the encoded frame 80/20 into training and test rows. Both halves
# still contain the target column, which is pulled out as y_train / y_test.
X_train, X_test = train_test_split(adult_dataframe_encoded, test_size=0.2, random_state=6)
y_train = X_train["incomeEncoded"]
y_test = X_test["incomeEncoded"]

# Gaussian naive Bayes on the selected features only.
gnb = GaussianNB()
# The model is fitted, queried and scored with bare arrays throughout. Mixing
# an array at fit time with DataFrames afterwards makes scikit-learn warn
# about mismatched feature names on every call.
gnb.fit(X_train[used_features].values, y_train)
y_pred = gnb.predict(X_test[used_features].values)

print('Accuracy in training set: {:.2f}'
      .format(gnb.score(X_train[used_features].values, y_train)))
print('Accuracy in test set: {:.2f}'
      .format(gnb.score(X_test[used_features].values, y_test)))
# Hand-built sample profiles run through both models.

# Column layout of one encoded row, target included.
columnas_prueba = ('education-num', 'workclassEncoded', 'marital-statusEncoded',
                   'occupationEncoded', 'relationshipEncoded', 'raceEncoded',
                   'sexEncoded', 'native-countryEncoded', 'incomeEncoded',
                   'ageEncoded', 'capital-gainEncoded', 'capital-lossEncoded',
                   'hours-per-weekEncoded', 'fnlwgtEncoded')


def _predecir_con_arbol(valores):
    """Print the decision tree's prediction and its probability for one row.

    `valores` holds one value per entry of `columnas_prueba`; the
    'incomeEncoded' entry is dropped before predicting. Returns the one-row
    frame, the predicted class array and the class-probability array.
    """
    x_test = pd.DataFrame([valores], columns=columnas_prueba)
    # The tree was fitted on a bare array, so it is queried with one too.
    caracteristicas = x_test.drop(['incomeEncoded'], axis=1).values
    y_pred = decision_tree.predict(caracteristicas)
    print("Prediccion del DT: " + str(y_pred))
    y_proba = decision_tree.predict_proba(caracteristicas)
    # Probability the tree assigns to the class it predicted.
    print("Probabilidad de Acierto: " + str(np.round(y_proba[0][y_pred] * 100, 2)) + "%")
    return x_test, y_pred, y_proba


# The Bayesian vectors below assume `used_features` is, in this order:
# ['education-num', 'marital-statusEncoded', 'ageEncoded',
#  'capital-gainEncoded', 'hours-per-weekEncoded'] -- verify against the
# SelectKBest output before trusting these predictions.

# Bill Gates: decision tree
x_test, y_pred, y_proba = _predecir_con_arbol((10, 1, 0, 2, 1, 0, 0, 1, 1, 6, 3, 0, 4, 0))
# Bill Gates: naive Bayes
billBayesiano = [10, 0, 6, 3, 4]
print("Prediccion Bayesiana: " + str(gnb.predict([billBayesiano])))

# Obama: decision tree
x_test, y_pred, y_proba = _predecir_con_arbol((16, 0, 0, 2, 1, 1, 0, 1, 1, 5, 3, 0, 5, 0))
# Obama: naive Bayes
obamaBayesiano = [16, 0, 5, 3, 5]
print("Prediccion Bayesiana: " + str(gnb.predict([obamaBayesiano])))

# Persona: decision tree
x_test, y_pred, y_proba = _predecir_con_arbol((10, 3, 0, 13, 2, 0, 1, 6, 0, 5, 0, 0, 4, 1))
# Persona: naive Bayes
personaBayesiano = [10, 0, 5, 0, 4]
print("Prediccion Bayesiana: " + str(gnb.predict([personaBayesiano])))