import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint, uniform

from sklearn.model_selection import train_test_split, KFold, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.neural_network import MLPClassifier

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the estimators fitted
# below — confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

# Read the CSV from the current working directory. The path has no
# placeholders, so it is a plain string literal rather than an f-string.
df = pd.read_csv("features_lasso.csv")
df.head()  # first rows; only rendered when this is the last line of a notebook cell

# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3767 entries, 0 to 3766
Data columns (total 45 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cocina        3767 non-null   int64  
 1   cocina_dor    3767 non-null   float64
 2   num_cuarto    3767 non-null   int64  
 3   bano_comp     3767 non-null   int64  
 4   bano_excus    3767 non-null   int64  
 5   bano_regad    3767 non-null   int64  
 6   estim_pago    3767 non-null   float64
 7   tot_resid     3767 non-null   int64  
 8   tipo_viv_4    3767 non-null   bool   
 9   mat_pared_7   3767 non-null   bool   
 10  mat_pared_8   3767 non-null   bool   
 11  mat_techos_3  3767 non-null   bool   
 12  mat_techos_4  3767 non-null   bool   
 13  mat_techos_6  3767 non-null   bool   
 14  mat_techos_8  3767 non-null   bool   
 15  mat_pisos_3   3767 non-null   bool   
 16  lugar_coc_4   3767 non-null   bool   
 17  lugar_coc_5   3767 non-null   bool   
 18  lugar_coc_6   3767 non-null   bool   
 19  ab_agua_5     3767 non-null   bool   
 20  ab_agua_6     3767 non-null   bool   
 21  ab_agua_7     3767 non-null   bool   
 22  dotac_agua_2  3767 non-null   bool   
 23  dotac_agua_4  3767 non-null   bool   
 24  dotac_agua_5  3767 non-null   bool   
 25  excusado_2    3767 non-null   bool   
 26  excusado_3    3767 non-null   bool   
 27  drenaje_2     3767 non-null   bool   
 28  drenaje_5     3767 non-null   bool   
 29  disp_elect_4  3767 non-null   bool   
 30  disp_elect_5  3767 non-null   bool   
 31  combus_3      3767 non-null   bool   
 32  combus_4      3767 non-null   bool   
 33  combus_7      3767 non-null   bool   
 34  eli_basura_2  3767 non-null   bool   
 35  eli_basura_4  3767 non-null   bool   
 36  eli_basura_5  3767 non-null   bool   
 37  eli_basura_6  3767 non-null   bool   
 38  tenencia_2    3767 non-null   bool   
 39  tenencia_4    3767 non-null   bool   
 40  tenencia_5    3767 non-null   bool   
 41  escrituras_4  3767 non-null   bool   
 42  escrituras_5  3767 non-null   bool   
 43  prop_muj      3767 non-null   float64
 44  est_socio     3767 non-null   int64  
dtypes: bool(35), float64(3), int64(7)
memory usage: 423.2 KB

# Separate the target column ("est_socio") from the predictor columns.
y = df["est_socio"]
X = df.drop(columns=["est_socio"])

# Hold out 30% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print("Entrenamiento:", X_train.shape, y_train.shape)
print("Prueba:", X_test.shape, y_test.shape)

Entrenamiento: (2636, 44) (2636,)
Prueba: (1131, 44) (1131,)

# Random-forest search space: the number of trees is sampled from a discrete
# uniform distribution on [100, 800); the other settings come from fixed lists.
rf_params = dict(
    n_estimators=randint(100, 800),
    max_depth=[5, 8, 12, 20, None],
    max_features=['sqrt', 'log2', 0.3, 0.5],
    min_samples_leaf=[1, 2, 4],
    class_weight=[None, 'balanced'],
)

# Metrics recorded for every sampled candidate; `refit` names the one used to
# pick the best candidate and refit it on the full training split.
scoring = ['accuracy', 'f1_macro', 'f1_weighted']
rf_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_params,
    n_iter=50,
    cv=5,
    scoring=scoring,
    refit='f1_weighted',
    random_state=42,
    n_jobs=-1,
)

rf_search.fit(X_train, y_train)
print("RF — Mejores parámetros:", rf_search.best_params_)

RF — Mejores parámetros: {'class_weight': None, 'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 4, 'n_estimators': 554}

# Position of the selected candidate inside the cv_results_ arrays.
idx = rf_search.best_index_
resultados = rf_search.cv_results_

# Mean cross-validated value of each tracked metric for that candidate.
for etiqueta, clave in (
    ("Accuracy  (CV):", 'mean_test_accuracy'),
    ("F1-macro  (CV):", 'mean_test_f1_macro'),
    ("F1-weighted (CV):", 'mean_test_f1_weighted'),
):
    print(etiqueta, resultados[clave][idx].round(4))

Accuracy  (CV): 0.7234
F1-macro  (CV): 0.6543
F1-weighted (CV): 0.7139

# AdaBoost grid: ensemble size, learning rate and the depth (1-3) of the
# decision tree used as base estimator.
ada_params = dict(
    n_estimators=[50, 100, 200, 300, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0],
    estimator=[DTC(max_depth=profundidad) for profundidad in (1, 2, 3)],
)

# Exhaustive search over the grid with 5-fold CV; the candidate with the best
# weighted F1 is refit on the full training split.
scoring = ['accuracy', 'f1_macro', 'f1_weighted']
ada_search = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=ada_params,
    cv=5,
    scoring=scoring,
    refit='f1_weighted',
    n_jobs=-1,
)
ada_search.fit(X_train, y_train)
print("AdaBoost — Mejores parámetros:", ada_search.best_params_)

AdaBoost — Mejores parámetros: {'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 0.05, 'n_estimators': 200}

# Position of the selected candidate inside the cv_results_ arrays.
idx = ada_search.best_index_
resultados = ada_search.cv_results_

# Mean cross-validated value of each tracked metric for that candidate.
for etiqueta, clave in (
    ("Accuracy  (CV):", 'mean_test_accuracy'),
    ("F1-macro  (CV):", 'mean_test_f1_macro'),
    ("F1-weighted (CV):", 'mean_test_f1_weighted'),
):
    print(etiqueta, resultados[clave][idx].round(4))

Accuracy  (CV): 0.7204
F1-macro  (CV): 0.6504
F1-weighted (CV): 0.7126

# Wider grid, currently disabled.
# NOTE(review): presumably switched off to limit run time — confirm before
# re-enabling:
#   'svc__C':            [0.1, 1, 10, 100],
#   'svc__gamma':        ['scale', 'auto', 0.001, 0.01, 0.1],
#   'svc__kernel':       ['rbf', 'linear'],
#   'svc__class_weight': [None, 'balanced'],
#
# Active grid: one configuration only, so the search just cross-validates it.
# Keys use the "<step>__<param>" form to reach the 'svc' step of the pipeline.
svm_params = dict(
    svc__C=[1],
    svc__gamma=['auto'],
    svc__kernel=['linear'],
    svc__class_weight=['balanced'],
)

scoring = ['accuracy', 'f1_macro', 'f1_weighted']

# Standardise the features before the SVM; doing it inside the pipeline means
# the scaler is fitted on the training part of each CV fold only.
svm_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', svm.SVC()),
])
svm_search = GridSearchCV(
    estimator=svm_pipe,
    param_grid=svm_params,
    cv=5,
    scoring=scoring,
    refit='f1_weighted',
    n_jobs=-1,
)
svm_search.fit(X_train, y_train)
print("SVM — Mejores parámetros:", svm_search.best_params_)

SVM — Mejores parámetros: {'svc__C': 1, 'svc__class_weight': 'balanced', 'svc__gamma': 'auto', 'svc__kernel': 'linear'}

# Position of the selected candidate inside the cv_results_ arrays.
idx = svm_search.best_index_
resultados = svm_search.cv_results_

# Mean cross-validated value of each tracked metric for that candidate.
for etiqueta, clave in (
    ("Accuracy  (CV):", 'mean_test_accuracy'),
    ("F1-macro  (CV):", 'mean_test_f1_macro'),
    ("F1-weighted (CV):", 'mean_test_f1_weighted'),
):
    print(etiqueta, resultados[clave][idx].round(4))

Accuracy  (CV): 0.6415
F1-macro  (CV): 0.632
F1-weighted (CV): 0.6526

# MLP search space; keys address the 'mlpclassifier' step of the pipeline.
nn_params = {
    'mlpclassifier__hidden_layer_sizes': [(64,), (128,), (64,32), (128,64)],
    'mlpclassifier__alpha': [1e-4, 1e-3, 0.01, 0.1],
    'mlpclassifier__activation': ['relu', 'tanh'],
    'mlpclassifier__learning_rate_init': [1e-4, 1e-3, 0.01],
}

# Scaler followed by the network. The step names are the lower-cased class
# names, which is what the parameter keys above rely on.
nn_pipe = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('mlpclassifier', MLPClassifier(max_iter=500, early_stopping=True, random_state=42)),
])

# 30 random draws from the space, 5-fold CV, refit on the best weighted F1.
scoring = ['accuracy', 'f1_macro', 'f1_weighted']
nn_search = RandomizedSearchCV(
    estimator=nn_pipe,
    param_distributions=nn_params,
    n_iter=30,
    cv=5,
    scoring=scoring,
    refit='f1_weighted',
    random_state=42,
    n_jobs=-1,
)
nn_search.fit(X_train, y_train)
print("MLP — Mejores parámetros:", nn_search.best_params_)

MLP — Mejores parámetros: {'mlpclassifier__learning_rate_init': 0.01, 'mlpclassifier__hidden_layer_sizes': (128,), 'mlpclassifier__alpha': 0.0001, 'mlpclassifier__activation': 'tanh'}

# Position of the selected candidate inside the cv_results_ arrays.
idx = nn_search.best_index_
resultados = nn_search.cv_results_

# Per-fold scores of the selected candidate, one column per metric. The fold
# count is read from the fitted search (n_splits_) instead of being hard-coded,
# so the table stays complete if the search's `cv` setting is changed.
scores_nn = pd.DataFrame({
    f'test_{metrica}': [
        resultados[f'split{i}_test_{metrica}'][idx]
        for i in range(nn_search.n_splits_)
    ]
    for metrica in ('accuracy', 'f1_macro', 'f1_weighted')
})

print(scores_nn, "\n")
print(scores_nn.mean())

   test_accuracy  test_f1_macro  test_f1_weighted
0       0.695076       0.629835          0.695314
1       0.700190       0.642559          0.701940
2       0.709677       0.652311          0.710799
3       0.721063       0.652681          0.718286
4       0.721063       0.673297          0.721691 

test_accuracy       0.709414
test_f1_macro       0.650137
test_f1_weighted    0.709606
dtype: float64

# Predict on the held-out split with the refit best random forest.
y_pred_rf = rf_search.predict(X_test)

# Display names passed to the report and used as heatmap tick labels.
clases = ['Bajo', 'Medio Bajo', 'Medio Alto', 'Alto']

print("=== Random Forest — Conjunto de prueba ===\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}\n")
print(classification_report(y_test, y_pred_rf, target_names=clases))

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=clases,
    yticklabels=clases,
    ax=ax,
)
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusión — Random Forest")
plt.tight_layout()
plt.show()

=== Random Forest — Conjunto de prueba ===

Accuracy: 0.6958

              precision    recall  f1-score   support

        Bajo       0.67      0.51      0.58       111
  Medio Bajo       0.76      0.85      0.80       641
  Medio Alto       0.46      0.39      0.42       223
        Alto       0.72      0.60      0.66       156

    accuracy                           0.70      1131
   macro avg       0.65      0.59      0.62      1131
weighted avg       0.68      0.70      0.69      1131

# Predict on the held-out split with the refit best AdaBoost model.
y_pred_babc = ada_search.predict(X_test)

# Display names passed to the report and used as heatmap tick labels.
clases = ['Bajo', 'Medio Bajo', 'Medio Alto', 'Alto']

print("=== AdaBoost (Boosting) — Conjunto de prueba ===\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred_babc):.4f}\n")
print(classification_report(y_test, y_pred_babc, target_names=clases))

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm_babc = confusion_matrix(y_test, y_pred_babc)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_babc,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=clases,
    yticklabels=clases,
    ax=ax,
)
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusión — AdaBoost")
plt.tight_layout()
plt.show()

=== AdaBoost (Boosting) — Conjunto de prueba ===

Accuracy: 0.6852

              precision    recall  f1-score   support

        Bajo       0.67      0.52      0.59       111
  Medio Bajo       0.75      0.85      0.80       641
  Medio Alto       0.43      0.39      0.41       223
        Alto       0.73      0.56      0.64       156

    accuracy                           0.69      1131
   macro avg       0.64      0.58      0.61      1131
weighted avg       0.68      0.69      0.68      1131

# Predict on the held-out split with the refit best SVM pipeline.
y_pred_svm = svm_search.predict(X_test)

# Read the kernel from the fitted estimator so the printed header always names
# the model that was actually evaluated. The header used to say "RBF" although
# the search grid only contains the linear kernel.
kernel_svm = svm_search.best_estimator_.named_steps['svc'].kernel

print(f"=== SVM ({kernel_svm} kernel) — Conjunto de prueba ===\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}\n")
print(classification_report(y_test, y_pred_svm,
                             target_names=['Bajo','Medio Bajo','Medio Alto','Alto']))

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm_svm = confusion_matrix(y_test, y_pred_svm)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Greens',
            xticklabels=['Bajo','Medio Bajo','Medio Alto','Alto'],
            yticklabels=['Bajo','Medio Bajo','Medio Alto','Alto'], ax=ax)
ax.set_title("Matriz de Confusión — SVM")
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")
plt.tight_layout()
plt.show()

=== SVM (RBF kernel) — Conjunto de prueba ===

Accuracy: 0.6012

              precision    recall  f1-score   support

        Bajo       0.48      0.91      0.63       111
  Medio Bajo       0.84      0.55      0.66       641
  Medio Alto       0.36      0.56      0.44       223
        Alto       0.65      0.67      0.66       156

    accuracy                           0.60      1131
   macro avg       0.58      0.67      0.60      1131
weighted avg       0.68      0.60      0.61      1131

# Predict on the held-out split with the refit best MLP pipeline.
y_pred_nn = nn_search.predict(X_test)

# Display names passed to the report and used as heatmap tick labels.
clases = ['Bajo', 'Medio Bajo', 'Medio Alto', 'Alto']

print("=== Red Neuronal (MLP) — Conjunto de prueba ===\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nn):.4f}\n")
print(classification_report(y_test, y_pred_nn, target_names=clases))

# Confusion matrix drawn as an annotated heatmap of integer counts.
cm_nn = confusion_matrix(y_test, y_pred_nn)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_nn,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=clases,
    yticklabels=clases,
    ax=ax,
)
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusión — Red Neuronal")
plt.tight_layout()
plt.show()

=== Red Neuronal (MLP) — Conjunto de prueba ===

Accuracy: 0.6861

              precision    recall  f1-score   support

        Bajo       0.60      0.53      0.56       111
  Medio Bajo       0.76      0.83      0.79       641
  Medio Alto       0.46      0.33      0.38       223
        Alto       0.66      0.72      0.69       156

    accuracy                           0.69      1131
   macro avg       0.62      0.60      0.61      1131
weighted avg       0.67      0.69      0.67      1131

# ── Comparison table of test-set metrics ─────────────────────────────────────

modelos = ['Random Forest', 'AdaBoost', 'SVM', 'Red Neuronal']
preds   = [y_pred_rf, y_pred_babc, y_pred_svm, y_pred_nn]

# One row per model: accuracy plus macro- and weighted-averaged F1, each
# rounded to four decimals.
resultados = [
    {
        'Modelo'        : nombre,
        'Accuracy'      : round(accuracy_score(y_test, y_pred), 4),
        'F1-macro'      : round(f1_score(y_test, y_pred, average='macro'), 4),
        'F1-weighted'   : round(f1_score(y_test, y_pred, average='weighted'), 4),
    }
    for nombre, y_pred in zip(modelos, preds)
]

df_resultados = pd.DataFrame(resultados).set_index('Modelo')
print("=== Resumen comparativo — conjunto de prueba ===\n")
print(df_resultados.to_string())

# ── Comparison bar chart ─────────────────────────────────────────────────────
# Reshape to long format (one row per model/metric pair) for a grouped barplot.
df_plot = df_resultados.reset_index().melt(
    id_vars='Modelo',
    var_name='Métrica',
    value_name='Valor',
)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=df_plot, x='Modelo', y='Valor', hue='Métrica', ax=ax)
ax.set_ylim(0.5, 0.85)  # fixed y-range for the score axis
ax.set_title("Comparación de métricas en conjunto de prueba")
ax.set_xlabel("")
ax.set_ylabel("Valor")
ax.legend(title="Métrica")
# Write each bar's value above it.
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', padding=2, fontsize=8)
plt.tight_layout()
plt.show()

=== Resumen comparativo — conjunto de prueba ===

               Accuracy  F1-macro  F1-weighted
Modelo                                        
Random Forest    0.6958    0.6160       0.6866
AdaBoost         0.6852    0.6067       0.6768
SVM              0.6012    0.5972       0.6141
Red Neuronal     0.6861    0.6061       0.6740

	cocina	cocina_dor	num_cuarto	bano_comp	estim_pago	tot_resid	tipo_viv_4	mat_pared_7	...	eli_basura_4	eli_basura_5	eli_basura_6	tenencia_2	tenencia_4	tenencia_5	escrituras_4	escrituras_5	prop_muj	est_socio
0	1	2.0	4	1	3000.0	2	False	False	...	False	False	False	False	True	False	False	False	0.50	2
1	1	2.0	4	1	2500.0	4	False	False	...	False	False	False	False	False	False	False	True	0.75	2
2	1	2.0	5	2	3000.0	4	False	False	...	False	False	False	False	True	False	False	False	0.75	2
3	1	2.0	3	1	3000.0	2	False	False	...	False	False	False	False	True	False	False	False	0.50	2
4	1	2.0	4	1	2500.0	6	False	True	...	False	False	False	False	False	False	False	True	0.50	2

A2.3 Modelos de ensamble, SVM y redes neuronales¶

Introducción¶

Preparación del escenario experimental¶

Carga de librerías¶

Carga del dataset¶

Descripción del dataset¶

División entrenamiento-prueba¶

Construcción de modelos¶

Random Forest¶

Boosting¶

Support Vector Machine (SVM)¶

Red Neuronal¶

Evaluación y comparación de desempeño¶

Random Forest¶

Boosting¶

Support Vector Machine (SVM)¶

Red Neuronal¶

Tabla comparativa¶

Análisis crítico¶

¿El aumento en complejidad se tradujo en mejoras claras?¶

Riesgos de sobreajuste¶

Interpretabilidad relativa¶

¿Cuándo preferir cada modelo?¶

Conclusiones¶

Referencias¶

Modelo	Accuracy CV	Accuracy Prueba	Diferencia
Random Forest	0.7234	0.6958	−0.028
AdaBoost	0.7204	0.6852	−0.035
SVM	0.6415	0.6012	−0.040
Red Neuronal	0.7094	0.6861	−0.023