import pandas as pd 

# Load the modelling table from disk into `df`.
# NOTE(review): the file name suggests the columns were pre-selected with a
# LASSO step elsewhere — confirm against the script that produces this CSV.
df = pd.read_csv('features_lasso.csv')

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Name of the response column; every other column of `df` is used as a predictor.
TARGET = 'est_socio'
DROP = [TARGET]  # columns left out of the feature matrix (only the target)
FEATURES = df.columns[~df.columns.isin(DROP)].tolist()

# Design matrix and response vector.
X = df.loc[:, FEATURES]
y = df.loc[:, TARGET]

# Hold out 20 % of the rows for testing. Stratifying on `y` keeps the class
# proportions approximately equal in both partitions, and the fixed seed
# makes the split reproducible.
split_kwargs = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Report the size of each partition, right-aligned in a 5-character field.
for etiqueta, particion in (
    ("Total de observaciones :", df),
    ("Entrenamiento (80 %)   :", X_train),
    ("Prueba        (20 %)   :", X_test),
):
    print(f"{etiqueta} {len(particion):>5}")

Total de observaciones :  3767
Entrenamiento (80 %)   :  3013
Prueba        (20 %)   :   754

# Human-readable names for the numeric class codes of the target.
etiquetas = {1: "Bajo", 2: "Medio bajo", 3: "Medio alto", 4: "Alto"}

# Class codes in ascending order; labels and colours below follow this order.
codigos = sorted(etiquetas)
categorias = [etiquetas[k] for k in codigos]
colores = ["#4C72B0", "#DD8452", "#36BB55", "#C44E52"]

# One panel per partition, sharing the y axis so bar heights are comparable.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

for ax, (serie, titulo, n) in zip(axes, [
    (y,       "Dataset original",      len(y)),
    (y_train, "Entrenamiento 80 %",    len(y_train)),
    (y_test,  "Prueba 20 %",           len(y_test)),
]):
    # Reindex on the declared class codes so that every bar is paired with its
    # own label and colour even when a class is absent from a partition (it is
    # then drawn as 0 %). Without this, a missing class makes the heights array
    # shorter than `categorias` and the bars no longer line up with the labels.
    # Codes that are not listed in `etiquetas` are not plotted.
    pcts = (
        serie.value_counts(normalize=True).reindex(codigos, fill_value=0.0) * 100
    )
    bars = ax.bar(categorias, pcts.values, color=colores,
                  edgecolor="white", linewidth=1.2, width=0.6)
    ax.set_title(f"{titulo}\n(n = {n:,})", fontsize=12, fontweight="bold", pad=10)
    ax.set_xlabel("Estrato socioeconómico", fontsize=10)
    ax.set_ylabel("Proporción (%)", fontsize=10)
    ax.set_ylim(0, 75)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.tick_params(axis="x", rotation=15)
    ax.spines[["top", "right"]].set_visible(False)
    # Print the percentage just above each bar.
    for bar, val in zip(bars, pcts.values):
        ax.text(bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 1,
                f"{val:.1f}%", ha="center", va="bottom",
                fontsize=9, fontweight="bold")

plt.suptitle("Balance de clases: Original vs. Entrenamiento vs. Prueba",
             fontsize=13, y=1.02, fontweight="bold")
plt.tight_layout()
plt.show()

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the predictors, then fit a logistic-regression classifier.
# Keeping both steps inside one Pipeline means cross-validation fits the
# scaler on each training fold only.
clasificador = LogisticRegression(max_iter=5000, solver='lbfgs', random_state=42)
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', clasificador)])

# 5 stratified, shuffled folds with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1_macro', 'f1_weighted']

cv_results = cross_validate(
    pipe, X_train, y_train, cv=cv, scoring=scoring, return_train_score=True
)

# Display name -> (key of the train scores, key of the validation scores)
# inside `cv_results`.
metricas = {
    'Accuracy':    ('train_accuracy',    'test_accuracy'),
    'F1-macro':    ('train_f1_macro',    'test_f1_macro'),
    'F1-weighted': ('train_f1_weighted', 'test_f1_weighted'),
}


def _fila_resumen(nombre, puntajes_train, puntajes_val):
    """Build one table row: per-fold train/validation scores plus summaries.

    All values are rounded to 4 decimals. The summaries are the mean of the
    train scores and the mean and standard deviation of the validation scores.
    """
    fila = {'Métrica': nombre}
    pares = zip(puntajes_train, puntajes_val)
    for pliegue, (p_train, p_val) in enumerate(pares, start=1):
        fila[f'Train F{pliegue}'] = round(p_train, 4)
        fila[f'Val F{pliegue}'] = round(p_val, 4)
    fila['Train μ'] = round(puntajes_train.mean(), 4)
    fila['Val μ'] = round(puntajes_val.mean(), 4)
    fila['Val σ'] = round(puntajes_val.std(), 4)
    return fila


rows = [
    _fila_resumen(nombre, cv_results[clave_train], cv_results[clave_val])
    for nombre, (clave_train, clave_val) in metricas.items()
]

tabla_cv = pd.DataFrame(rows).set_index('Métrica')
print("Resultados de validación cruzada (5 pliegues estratificados):")
print(tabla_cv.to_string())

Resultados de validación cruzada (5 pliegues estratificados):
             Train F1  Val F1  Train F2  Val F2  Train F3  Val F3  Train F4  Val F4  Train F5  Val F5  Train μ   Val μ   Val σ
Métrica                                                                                                                       
Accuracy       0.7402  0.7148    0.7469  0.7247    0.7427  0.7081    0.7412  0.7342    0.7424  0.7226   0.7427  0.7209  0.0089
F1-macro       0.6753  0.6380    0.6877  0.6525    0.6775  0.6447    0.6805  0.6621    0.6768  0.6653   0.6796  0.6525  0.0103
F1-weighted    0.7282  0.7028    0.7379  0.7078    0.7308  0.6986    0.7309  0.7229    0.7306  0.7156   0.7317  0.7095  0.0088

# ── Observed class distribution ──────────────────────────────────────────────
# NOTE(review): the baseline is computed on the full dataset `y`, while the
# model score below comes from cross-validation on the training partition.
# The split is stratified, so both proportions should be nearly identical.
conteos   = y.value_counts().sort_index()
total     = len(y)

# ── Majority class (the weakest sensible model: always predict it) ───────────
clase_mayoritaria     = conteos.idxmax()
accuracy_baseline     = conteos.max() / total
# Readable name of the majority class; falls back to the raw code if the code
# is not listed in `etiquetas`.
nombre_mayoritaria    = etiquetas.get(clase_mayoritaria, clase_mayoritaria)

# ── Comparison against the fitted model ──────────────────────────────────────
# Read the mean validation accuracy straight from the cross-validation results
# instead of copying a rounded number by hand, so the figure cannot go stale
# when the data or the model change.
accuracy_modelo = cv_results['test_accuracy'].mean()
mejora_real     = accuracy_modelo - accuracy_baseline

# ── Plot ─────────────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(7, 4))

barras = ax.bar(
    [f'Baseline\n(siempre {nombre_mayoritaria})', 'Tu modelo\n(Reg. Logística)'],
    [accuracy_baseline, accuracy_modelo],
    color=['#C44E52', '#4C72B0'],
    width=0.4, edgecolor='white'
)

# Horizontal reference line at the baseline accuracy.
ax.axhline(accuracy_baseline, color='#C44E52', linestyle='--',
           linewidth=1.2, alpha=0.7)

# Double-headed arrow spanning the gap between baseline and model.
ax.annotate('',
    xy=(1, accuracy_modelo),
    xytext=(1, accuracy_baseline),
    arrowprops=dict(arrowstyle='<->', color='black', lw=1.5)
)
# The `+` format flag prints the correct sign even if the model is worse than
# the baseline (a literal '+' prefix would render "+-x.x").
ax.text(1.05, (accuracy_modelo + accuracy_baseline) / 2,
        f'{mejora_real*100:+.1f} pp', va='center', fontsize=10, color='black')

# Value labels on top of each bar.
for bar, val in zip(barras, [accuracy_baseline, accuracy_modelo]):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
            f'{val:.4f}', ha='center', fontsize=11, fontweight='bold')

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy', fontsize=11)
ax.set_title('Accuracy real vs. baseline (clasificador trivial)', fontsize=12, fontweight='bold')
ax.spines[['top', 'right']].set_visible(False)
plt.tight_layout()
plt.show()

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 1. Final fit on the whole training partition.
pipe.fit(X_train, y_train)

# 2. Predictions for the held-out test partition.
y_pred = pipe.predict(X_test)

# Explicit class-code order. Passing it as `labels=` ties each name in
# `categorias` to its own class code; without it the names are matched
# positionally to whichever classes happen to appear in y_test / y_pred.
codigos_clase = sorted(etiquetas)

# 3. Classification report.
print("Reporte de Clasificación en el Conjunto de Prueba:")
print("-" * 60)
print(classification_report(y_test, y_pred,
                            labels=codigos_clase, target_names=categorias))

# 4. Confusion matrix (rows and columns follow `codigos_clase`).
cm = confusion_matrix(y_test, y_pred, labels=codigos_clase)
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=categorias)
disp.plot(cmap='Blues', ax=ax, values_format='d')

# Use Axes methods rather than pyplot's "current axes" state, so the title and
# axis labels always land on the matrix axes and never on the colour bar.
ax.set_title("Matriz de Confusión - Conjunto de Prueba", fontsize=14, fontweight="bold", pad=15)
ax.set_xlabel("Predicción del Modelo", fontsize=11)
ax.set_ylabel("Etiqueta Real", fontsize=11)
ax.spines[['top', 'right', 'bottom', 'left']].set_visible(False)
plt.show()

Reporte de Clasificación en el Conjunto de Prueba:
------------------------------------------------------------
              precision    recall  f1-score   support

        Bajo       0.70      0.57      0.63        74
  Medio bajo       0.76      0.85      0.80       427
  Medio alto       0.43      0.37      0.40       149
        Alto       0.70      0.61      0.65       104

    accuracy                           0.69       754
   macro avg       0.65      0.60      0.62       754
weighted avg       0.68      0.69      0.68       754

from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Predicted probability of the "Bajo" class.
# The column of `predict_proba` that belongs to a class is its position in
# `pipe.classes_`; look it up instead of assuming it is column 0.
CODIGO_BAJO = 1
idx_bajo = list(pipe.classes_).index(CODIGO_BAJO)
y_proba_bajo = pipe.predict_proba(X_test)[:, idx_bajo]

# 2. Binary ground truth: 1 when the true class is "Bajo", 0 otherwise.
y_test_bajo = (y_test == CODIGO_BAJO).astype(int)

# 3. Evaluate several decision thresholds (one-vs-rest for "Bajo").
# NOTE(review): these metrics are computed on the test partition; if a
# threshold is later chosen from this table, its test metrics will be
# optimistic — consider selecting it on validation data instead.
umbrales = [0.20, 0.30, 0.40, 0.50, 0.60, 0.80]
resultados_umbrales = []

for umbral in umbrales:
    # Predict "Bajo" (1) when its probability reaches the threshold.
    y_pred_umbral = (y_proba_bajo >= umbral).astype(int)

    # zero_division=0 reports 0 instead of warning when a threshold yields no
    # positive predictions.
    precision = precision_score(y_test_bajo, y_pred_umbral, zero_division=0)
    recall = recall_score(y_test_bajo, y_pred_umbral, zero_division=0)
    f1 = f1_score(y_test_bajo, y_pred_umbral, zero_division=0)

    resultados_umbrales.append([umbral, round(precision, 4), round(recall, 4), round(f1, 4)])

# 4. Collect the results in a DataFrame and print them.
df_umbrales = pd.DataFrame(resultados_umbrales, columns=["Umbral", "Precisión", "Recall (Sensibilidad)", "F1-Score"])

print("Análisis de Umbrales para clasificar la clase 'Bajo' vs 'Resto':")
print("-" * 75)
print(df_umbrales)

Análisis de Umbrales para clasificar la clase 'Bajo' vs 'Resto':
---------------------------------------------------------------------------
   Umbral  Precisión  Recall (Sensibilidad)  F1-Score
0     0.2     0.5078                 0.8784    0.6436
1     0.3     0.5743                 0.7838    0.6629
2     0.4     0.5882                 0.6757    0.6289
3     0.5     0.7000                 0.5676    0.6269
4     0.6     0.7609                 0.4730    0.5833
5     0.8     0.8889                 0.2162    0.3478

import pandas as pd

# Final estimator of the pipeline (the fitted logistic regression).
modelo_logistico = pipe[-1]

# Feature names as seen by the classifier. Ask the preprocessing steps first;
# fall back to the training columns when they cannot provide names (a step
# without `get_feature_names_out`, or one that is not fitted / has no names).
try:
    nombres_variables = pipe[:-1].get_feature_names_out()
except (AttributeError, ValueError):
    nombres_variables = X_train.columns

# Row labels derived from `classes_`, so each row of `coef_` is named after the
# class it actually belongs to rather than after an assumed fixed order.
# Unknown codes fall back to the raw class value.
nombres_clases = [etiquetas.get(c, c) for c in modelo_logistico.classes_]

# Coefficient table: one row per class, one column per (standardised) feature.
df_coeficientes = pd.DataFrame(
    modelo_logistico.coef_,
    columns=nombres_variables,
    index=nombres_clases
)

# Render the table (`display` is provided by the notebook environment).
display(df_coeficientes)

import pandas as pd

# 1. Final estimator of the pipeline (the fitted logistic regression).
modelo_logistico = pipe[-1]

# 2. Feature names as seen by the classifier. Ask the preprocessing steps
#    first; fall back to the training columns when they cannot provide names.
try:
    nombres_variables = pipe[:-1].get_feature_names_out()
except (AttributeError, ValueError):
    nombres_variables = X_train.columns

# 3. Base coefficient table. Row labels come from `classes_` so every row of
#    `coef_` is named after its own class; unknown codes keep the raw value.
nombres_clases = [etiquetas.get(c, c) for c in modelo_logistico.classes_]
df_raw = pd.DataFrame(
    modelo_logistico.coef_,
    columns=nombres_variables,
    index=nombres_clases
)

# ── 4. Presentation tweaks ──────────────────────────────────────────────────

# a) Transpose: features as rows, classes as columns.
df_visible = df_raw.T

# b) Conditional formatting: diverging colour map plus a fixed number format.
#    - 'coolwarm': blue is negative, red is positive.
#    - axis=None: the gradient is computed over the WHOLE table, not per column.
#    - vmin/vmax are set symmetrically around zero so the neutral midpoint of
#      the colour map corresponds exactly to a coefficient of 0; otherwise the
#      midpoint would sit at the middle of the observed range.
limite = df_visible.abs().to_numpy().max()
tabla_estilizada = (
    df_visible.style
    .background_gradient(cmap='coolwarm', axis=None, vmin=-limite, vmax=limite)
    .format("{:.4f}")
)

# 5. Render the styled table (`display` is provided by the notebook environment).
display(tabla_estilizada)

	Bajo	Medio bajo	Medio alto	Alto
cocina	0.0491	0.0038	-0.0434	-0.0095
cocina_dor	-0.0780	-0.0044	0.0045	0.0779
num_cuarto	0.2656	-0.2029	0.0215	-0.0843
bano_comp	-0.0012	-0.0121	-0.3245	0.3378
bano_excus	-0.0188	-0.2530	-0.1728	0.4447
bano_regad	-0.1613	0.1414	0.0606	-0.0407
estim_pago	-1.5210	-0.5802	0.5578	1.5434
tot_resid	0.1419	0.1101	-0.0445	-0.2076
tipo_viv_4	-0.1831	-0.0037	0.0292	0.1576
mat_pared_7	-0.4404	-0.0369	0.6665	-0.1892
mat_pared_8	-0.3324	0.0212	0.2661	0.0450
mat_techos_3	0.1885	0.2020	0.2380	-0.6285
mat_techos_4	0.1274	0.0346	-0.1600	-0.0020
mat_techos_6	0.0422	0.0691	-0.1318	0.0205
mat_techos_8	0.1281	0.1086	-0.2315	-0.0052
mat_pisos_3	-0.2783	-0.2886	0.1198	0.4471
lugar_coc_4	0.0780	0.1125	-0.1536	-0.0369
lugar_coc_5	0.0171	0.0911	-0.0834	-0.0248
lugar_coc_6	0.0287	0.0764	0.0113	-0.1164
ab_agua_5	0.0215	0.0534	-0.0929	0.0180
ab_agua_6	0.0637	-0.0672	-0.0100	0.0135
ab_agua_7	0.2049	0.0369	-0.2632	0.0214
dotac_agua_2	0.3164	0.2186	-0.3234	-0.2116
dotac_agua_4	0.1223	0.1640	-0.2491	-0.0371
dotac_agua_5	0.0813	0.1400	0.0848	-0.3061
excusado_2	0.5663	0.4373	-0.5848	-0.4188
excusado_3	-0.0738	-0.0405	0.1380	-0.0237
drenaje_2	0.8792	0.6515	-0.2965	-1.2342
drenaje_5	0.6887	0.2966	-0.8430	-0.1422
disp_elect_4	0.1571	0.1659	-0.2591	-0.0638
disp_elect_5	0.0836	-0.0322	-0.0708	0.0194
combus_3	-0.2621	0.1870	0.1854	-0.1102
combus_4	-0.6374	-0.0653	0.2932	0.4096
combus_7	-0.1963	0.1269	0.1677	-0.0983
eli_basura_2	0.2282	-0.1127	-0.0629	-0.0525
eli_basura_4	1.0282	-0.0774	-0.7699	-0.1809
eli_basura_5	-0.0432	0.0772	-0.0480	0.0140
eli_basura_6	0.1225	0.0510	-0.1143	-0.0593
tenencia_2	-0.0696	0.2249	0.1565	-0.3117
tenencia_4	0.1102	0.0398	0.1462	-0.2962
tenencia_5	-0.1625	0.1238	0.1211	-0.0825
escrituras_4	0.0156	-0.0080	0.0674	-0.0750
escrituras_5	0.0667	-0.3030	-0.1292	0.3655
prop_muj	0.0512	0.0189	-0.0087	-0.0614

A2.1 Regresión logística y validación cruzada

Definición del problema de clasificación

Separación de datos y balance de clases

Evaluación mediante validación cruzada

Entrenamiento final y evaluación en prueba

Curva ROC y AUC

Interpretación del modelo