import pandas as pd
import numpy as np

# Load the main dataset and its data dictionary from CSV, then report the dataset's (rows, columns).
df = pd.read_csv('base_pobreza.csv')
diccionario = pd.read_csv('diccionario_datos.csv')
print(df.shape)

(3767, 31)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3767 entries, 0 to 3766
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   folioviv    3767 non-null   int64  
 1   tipo_viv    3767 non-null   object 
 2   mat_pared   3767 non-null   int64  
 3   mat_techos  3767 non-null   int64  
 4   mat_pisos   3767 non-null   int64  
 5   cocina      3767 non-null   int64  
 6   cocina_dor  3662 non-null   float64
 7   cuart_dorm  3767 non-null   int64  
 8   num_cuarto  3767 non-null   int64  
 9   lugar_coc   3767 non-null   int64  
 10  ab_agua     3538 non-null   float64
 11  dotac_agua  3538 non-null   float64
 12  excusado    3767 non-null   int64  
 13  uso_compar  3740 non-null   float64
 14  sanit_agua  3740 non-null   float64
 15  bano_comp   3767 non-null   object 
 16  bano_excus  3767 non-null   object 
 17  bano_regad  3767 non-null   object 
 18  drenaje     3767 non-null   int64  
 19  disp_elect  3767 non-null   int64  
 20  combus      3767 non-null   int64  
 21  eli_basura  3767 non-null   int64  
 22  tenencia    3767 non-null   int64  
 23  estim_pago  3767 non-null   object 
 24  escrituras  2824 non-null   float64
 25  tot_resid   3767 non-null   int64  
 26  tot_hom     3767 non-null   int64  
 27  tot_muj     3767 non-null   int64  
 28  est_socio   3767 non-null   int64  
 29  ing_tri     3767 non-null   float64
 30  ing_tri_pc  3767 non-null   float64
dtypes: float64(8), int64(18), object(5)
memory usage: 912.4+ KB
None

# Print the full data dictionary (one row per dataset column).
print(diccionario)

                          nombre_campo tipo    nemónico    catálogo  \
0         Identificador de la vivienda    C    folioviv         NaN   
1                     Tipo de vivienda    C    tipo_viv    tipo_viv   
2                  Material de paredes    C   mat_pared   mat_pared   
3                   Material de techos    C  mat_techos  mat_techos   
4                    Material de pisos    C   mat_pisos   mat_pisos   
5                         Tiene cocina    C      cocina       si_no   
6         Utiliza cocina de dormitorio    C  cocina_dor       si_no   
7                   Cuartos dormitorio    N  cuart_dorm         NaN   
8                    Número de cuartos    N  num_cuarto         NaN   
9                  Lugar donde cocinan    C   lugar_coc   lugar_coc   
10              Abastecimiento de agua    C     ab_agua     ab_agua   
11                    Dotación de agua    C  dotac_agua  dotac_agua   
12                      Tiene excusado    C    excusado    excusado   
13        Uso compartido del sanitario    C  uso_compar       si_no   
14             Sanitario conexión agua    C  sanit_agua  sanit_agua   
15        Baño con excusado y regadera    N   bano_comp         NaN   
16              Baño solo con excusado    N  bano_excus         NaN   
17              Baño solo con regadera    N  bano_regad         NaN   
18                  Destino de drenaje    C     drenaje     drenaje   
19            Disponibilidad eléctrica    C  disp_elect  disp_elect   
20                 Tipo de combustible    C      combus      combus   
21               Eliminación de basura    C  eli_basura  eli_basura   
22     Tipo de tenencia de la vivienda    C    tenencia    tenencia   
23        Estimación del pago de renta    N  estim_pago         NaN   
24           Escrituras de la vivienda    C  escrituras  escrituras   
25  Total de residentes de la vivienda    N   tot_resid         NaN   
26         Total de residentes hombres    N     tot_hom         NaN   
27         Total de residentes mujeres    N     tot_muj         NaN   
28              Estrato socioeconómico    C   est_socio   est_socio   
29                  Ingreso trimestral    N     ing_tri         NaN   
30       Ingreso trimestral per capita    N  ing_tri_pc         NaN   

               rango_claves  
0   [0100001901-3260593818]  
1                     [1-7]  
2                     [1-8]  
3                   [01-10]  
4                     [1-3]  
5                     [1-2]  
6                     [1-2]  
7                    [1-99]  
8                    [1-99]  
9                     [1-6]  
10                    [1-7]  
11                    [1-5]  
12                    [1-3]  
13                    [1-2]  
14                    [1-3]  
15                    [0-9]  
16                    [0-9]  
17                    [0-9]  
18                    [1-5]  
19                    [1-5]  
20                    [1-7]  
21                    [1-8]  
22                    [1-6]  
23            [1-999999999]  
24                    [1-4]  
25                   [1-99]  
26                   [1-99]  
27                   [1-99]  
28                    [1-4]  
29   [0.00-999999999999.99]  
30   [0.00-999999999999.99]

# Preview the first five rows of the dataset.
print(df.head(5))

     folioviv tipo_viv  mat_pared  mat_techos  mat_pisos  cocina  cocina_dor  \
0  1906156202        2          8          10          2       1         2.0   
1  1906156203        2          8          10          2       1         2.0   
2  1906156204        1          8          10          3       1         2.0   
3  1906156407        1          8          10          2       1         2.0   
4  1906156408        1          7          10          3       1         2.0   

   cuart_dorm  num_cuarto  lugar_coc  ...  eli_basura  tenencia  estim_pago  \
0           2           4          1  ...           1         4        3000   
1           2           4          1  ...           1         1               
2           2           5          1  ...           1         4        3000   
3           1           3          1  ...           1         4        3000   
4           3           4          1  ...           1         1               

   escrituras  tot_resid tot_hom tot_muj est_socio   ing_tri  ing_tri_pc  
0         1.0          2       1       1         2  27890.10  13945.0500  
1         NaN          4       1       3         2  38571.42   9642.8550  
2         1.0          4       1       3         2  59340.65  14835.1625  
3         1.0          2       1       1         2  45244.56  22622.2800  
4         NaN          6       3       3         2  61483.68  10247.2800  

[5 rows x 31 columns]

# Show the dictionary rows (selected by index label) for uso_compar, sanit_agua,
# cocina_dor, ab_agua, dotac_agua, estim_pago and escrituras.
print(diccionario.loc[[13, 14, 6, 10, 11, 23, 24]])

                    nombre_campo tipo    nemónico    catálogo   rango_claves
13  Uso compartido del sanitario    C  uso_compar       si_no          [1-2]
14       Sanitario conexión agua    C  sanit_agua  sanit_agua          [1-3]
6   Utiliza cocina de dormitorio    C  cocina_dor       si_no          [1-2]
10        Abastecimiento de agua    C     ab_agua     ab_agua          [1-7]
11              Dotación de agua    C  dotac_agua  dotac_agua          [1-5]
23  Estimación del pago de renta    N  estim_pago         NaN  [1-999999999]
24     Escrituras de la vivienda    C  escrituras  escrituras          [1-4]

## Mode imputation for the columns with missing values.
## 'escrituras' and 'estim_pago' are left out because they are handled separately.

excluidas = ("escrituras", "estim_pago")
na_cols = [
    col
    for col in df.columns
    if df[col].isna().any() and col not in excluidas
]

print("Columnas con NA (excepto escrituras):", na_cols)

for col in na_cols:
    modas = df[col].mode(dropna=True)
    if not modas.empty:
        # Several values can tie for the mode; the first one returned is used.
        df[col] = df[col].fillna(modas.iloc[0])
    else:
        # No mode available (e.g. the column has no non-null values): leave it untouched.
        print(f"[WARN] {col}: no se encontro moda. Se omite.")

print("\nImputacion por moda aplicada a variables marginales.")

Columnas con NA (excepto escrituras): ['cocina_dor', 'ab_agua', 'dotac_agua', 'uso_compar', 'sanit_agua']

Imputacion por moda aplicada a variables marginales.

# Count the missing values in 'escrituras' before they are imputed.
nulos_antes = df["escrituras"].isna().sum()
print(f"Nulos en 'escrituras' antes de imputar: {nulos_antes}")

Nulos en 'escrituras' antes de imputar: 943

## Fixed-value imputation for 'escrituras': missing entries receive the extra code 5
## ("did not answer / originally missing"), which lies outside the dictionary range [1-4].

df["escrituras"] = df["escrituras"].fillna(5)
print("Imputacion por valor fijo aplicada a 'escrituras'.")

Imputacion por valor fijo aplicada a 'escrituras'.

## Median imputation for estim_pago. The column arrives as text, so it is first
## coerced to numeric; entries that cannot be parsed become NaN and are imputed too.

# Trimmed text copy, so blank and placeholder entries can be recognised.
texto = df["estim_pago"].astype(str).str.strip()

# Tokens that stand for "no value".
placeholders = {"", "NA", "N/A", "NULL", "null", "None", "nan", "NaN", "Sin dato", "No sabe"}
s = texto.mask(texto.isin(placeholders), np.nan)

# Numeric version of the column (unparseable text -> NaN).
estim_num = pd.to_numeric(s, errors="coerce")

# The median is taken over the observed values only and used to fill every gap.
global_median = estim_num.median(skipna=True)
df["estim_pago"] = estim_num.fillna(global_median)

print(f"Imputacion por mediana ({global_median}) aplicada a estim_pago.")

Imputacion por mediana (2500.0) aplicada a estim_pago.

# Show the dictionary rows for the three bathroom-count fields (bano_comp, bano_excus, bano_regad).
print(diccionario.loc[[15, 16, 17]])

                    nombre_campo tipo    nemónico catálogo rango_claves
15  Baño con excusado y regadera    N   bano_comp      NaN        [0-9]
16        Baño solo con excusado    N  bano_excus      NaN        [0-9]
17        Baño solo con regadera    N  bano_regad      NaN        [0-9]

# Frequency table of tipo_viv (missing values included), ordered by category label.
print("Frecuencias:")
print(df["tipo_viv"].value_counts(dropna=False).sort_index())

Frecuencias:
tipo_viv
&       3
1    3016
2     590
3      54
4      22
5      24
7      58
Name: count, dtype: int64

# 1) Trim the text values and turn the stray "&" code into a real missing value
tipo = df["tipo_viv"].astype(str).str.strip()
tipo = tipo.mask(tipo == "&", np.nan)

# 2) Safe conversion to a nullable-integer categorical
tipo = pd.to_numeric(tipo, errors="coerce").astype("Int64").astype("category")

# 3) Fill whatever is still missing with the mode
if tipo.isna().any():
    moda = tipo.mode(dropna=True).iloc[0]
    tipo = tipo.fillna(moda)

df["tipo_viv"] = tipo

print("\nImputacion por moda aplicada a variable 'tipo_viv'.")

Imputacion por moda aplicada a variable 'tipo_viv'.

# Bathroom-count columns that were read as text.
obj_clase = ["bano_comp", "bano_excus", "bano_regad"]

for col in obj_clase:
    # Trimmed text values, with blank entries counted as missing.
    limpio = df[col].astype(str).str.strip().replace("", np.nan)

    print("Frecuencias:")
    print(limpio.value_counts(dropna=False).sort_index(), "\n")

Frecuencias:
bano_comp
0       603
1      2390
2       618
3       104
4        20
5         5
NaN      27
Name: count, dtype: int64 

Frecuencias:
bano_excus
0      2408
1      1242
2        85
3         5
NaN      27
Name: count, dtype: int64 

Frecuencias:
bano_regad
0      3585
1       148
2         6
6         1
NaN      27
Name: count, dtype: int64

## Median imputation for bano_comp, bano_excus and bano_regad
for c in obj_clase:

    # 1-2) Trim the text form of the column and turn blank entries into real NaN
    s = df[c].astype(str).str.strip().replace("", np.nan)

    # 3) Parse as numbers (anything unparseable -> NaN)
    s_num = pd.to_numeric(s, errors="coerce")

    # 4) Fill the gaps with the column median, when there are any
    if s_num.isna().any():
        mediana = s_num.median(skipna=True)
        s_num = s_num.fillna(mediana)
        print(f"\nImputacion por mediana ({mediana}) aplicada a {c}")  

    # 5) Store the column back in the dataframe as int64
    df[c] = s_num.astype("int64")

# Final check. DataFrame.info() prints its own report and returns None,
# so it is not wrapped in print() (that would add a stray "None" line).
print("\n")
df[obj_clase].info()

Imputacion por mediana (1.0) aplicada a bano_comp

Imputacion por mediana (0.0) aplicada a bano_excus

Imputacion por mediana (0.0) aplicada a bano_regad


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3767 entries, 0 to 3766
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   bano_comp   3767 non-null   int64
 1   bano_excus  3767 non-null   int64
 2   bano_regad  3767 non-null   int64
dtypes: int64(3)
memory usage: 88.4 KB
None

# Columns treated as nominal categories.
cat_cols = [
    "tipo_viv", "mat_pared", "mat_techos", "mat_pisos", "lugar_coc",
    "ab_agua", "dotac_agua", "excusado", "sanit_agua", "drenaje",
    "disp_elect", "combus", "eli_basura", "tenencia", "escrituras"
]

# Keep only the ones actually present in the dataframe
cat_cols = [nombre for nombre in cat_cols if nombre in df.columns]

# 1) Diagnosis: report values that are neither integer-like text nor an empty/NaN token
tokens_vacios = ["nan", "NaN", "None", ""]
for c in cat_cols:
    s = df[c].astype(str).str.strip()
    es_numero = s.str.match(r"^-?\d+(\.0+)?$")
    bad_mask = ~(es_numero | s.isin(tokens_vacios))
    if bad_mask.any():
        print(f"\n[WARN] Valores no numericos detectados en {c}:")
        print(s[bad_mask].value_counts().head(10))

# 2) Safe conversion: non-numeric -> NaN, then nullable integer, then category
for c in cat_cols:
    como_numero = pd.to_numeric(df[c], errors="coerce")
    df[c] = como_numero.astype("Int64").astype("category")

# One-hot encode; drop_first=True removes one level per variable to avoid
# multicollinearity in linear models
df_dum = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("Columnas originales:", df.shape[1])
print("Columnas despues de dummies:", df_dum.shape[1])
print("\nColumnas dummy creadas:")
print(df_dum.columns)

Columnas originales: 31
Columnas despues de dummies: 87

Columnas dummy creadas:
Index(['folioviv', 'cocina', 'cocina_dor', 'cuart_dorm', 'num_cuarto',
       'uso_compar', 'bano_comp', 'bano_excus', 'bano_regad', 'estim_pago',
       'tot_resid', 'tot_hom', 'tot_muj', 'est_socio', 'ing_tri', 'ing_tri_pc',
       'tipo_viv_2', 'tipo_viv_3', 'tipo_viv_4', 'tipo_viv_5', 'tipo_viv_7',
       'mat_pared_2', 'mat_pared_3', 'mat_pared_4', 'mat_pared_5',
       'mat_pared_6', 'mat_pared_7', 'mat_pared_8', 'mat_techos_2',
       'mat_techos_3', 'mat_techos_4', 'mat_techos_5', 'mat_techos_6',
       'mat_techos_7', 'mat_techos_8', 'mat_techos_9', 'mat_techos_10',
       'mat_pisos_2', 'mat_pisos_3', 'lugar_coc_2', 'lugar_coc_3',
       'lugar_coc_4', 'lugar_coc_5', 'lugar_coc_6', 'ab_agua_2', 'ab_agua_3',
       'ab_agua_4', 'ab_agua_5', 'ab_agua_6', 'ab_agua_7', 'dotac_agua_2',
       'dotac_agua_3', 'dotac_agua_4', 'dotac_agua_5', 'excusado_2',
       'excusado_3', 'sanit_agua_2', 'sanit_agua_3', 'drenaje_2', 'drenaje_3',
       'drenaje_4', 'drenaje_5', 'disp_elect_2', 'disp_elect_3',
       'disp_elect_4', 'disp_elect_5', 'combus_2', 'combus_3', 'combus_4',
       'combus_5', 'combus_6', 'combus_7', 'eli_basura_2', 'eli_basura_3',
       'eli_basura_4', 'eli_basura_5', 'eli_basura_6', 'eli_basura_7',
       'tenencia_2', 'tenencia_3', 'tenencia_4', 'tenencia_5', 'tenencia_6',
       'escrituras_2', 'escrituras_3', 'escrituras_4', 'escrituras_5'],
      dtype='object')

COL = "ing_tri_pc"

# Numeric view of the column (unparseable entries -> NaN).
x = pd.to_numeric(df_dum[COL], errors="coerce")

# Print the ten largest and then the ten smallest values.
for encabezado, ascendente in (
    ("\nTop 10 valores más altos:", False),
    ("\nTop 10 valores más bajos:", True),
):
    print(encabezado)
    print(x.sort_values(ascending=ascendente).head(10).to_string(index=False))

Top 10 valores más altos:
8686956.5150
 653736.2580
 574092.3840
 424599.5500
 392090.1625
 372502.1700
 340862.7675
 296796.1900
 294889.4500
 277868.8300

Top 10 valores más bajos:
  0.000000
  0.000000
  0.000000
  0.000000
  0.000000
  0.000000
  0.000000
  0.000000
146.735000
277.173333

import matplotlib.pyplot as plt

TARGET_COL = "ing_tri_pc"

# Numeric target; the plots use only the rows where it is present.
y_raw = pd.to_numeric(df_dum[TARGET_COL], errors="coerce")
y = y_raw.dropna().copy()
y_log = np.log1p(y)

# One histogram for the raw target and one for its log1p transform.
for valores, titulo_hist, etiqueta_x in (
    (y, f"Histograma (crudo): {TARGET_COL}", TARGET_COL),
    (y_log, f"Histograma (transformación): log1p({TARGET_COL})", f"log1p({TARGET_COL})"),
):
    plt.figure()
    plt.hist(valores, bins=60)
    plt.title(titulo_hist)
    plt.xlabel(etiqueta_x)
    plt.ylabel("Frecuencia")
    plt.tight_layout()
    plt.show()

# Store the transformed target in the final dataframe; rows whose target is
# missing stay NaN because log1p propagates NaN.
df_dum["y_log_ing_tri_pc"] = np.log1p(y_raw)

# Robust target (intended to avoid NaN/-inf caused by negative values)
y_raw = pd.to_numeric(df["ing_tri_pc"], errors="coerce")

# Option A (conservative): keep only rows whose target is present and non-negative
# NOTE(review): df_model is built from df (not df_dum) and is not referenced again
# in the visible part of this file — confirm whether this filter was meant to be
# applied to the data that is actually modelled.
mask = y_raw.notna() & (y_raw >= 0)
df_model = df.loc[mask].copy()

# Share of female residents per dwelling. A resident count of 0 is replaced by
# NaN first, so those rows end up as NaN instead of dividing by zero.
residentes = df_dum["tot_resid"].replace(0, np.nan)
df_dum["prop_muj"] = df_dum["tot_muj"] / residentes
print("Variable 'prop_muj' creada como tot_muj / tot_resid (con NaN donde tot_resid=0).")

Variable 'prop_muj' creada como tot_muj / tot_resid (con NaN donde tot_resid=0).

from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ─── 1. Feature / target separation ─────────────────────────────────────────────

# Columns kept out of the feature matrix.
columnas_excluidas = [
    "ing_tri", "ing_tri_pc", "y_log_ing_tri_pc",  # income columns and the target itself
    "folioviv",
    "est_socio",
    "tot_hom",    # dropped as redundant (original note: tot_resid = tot_hom + tot_muj)
    "tot_muj",    # prop_muj is used in its place
]
X = df_dum.drop(columns=columnas_excluidas)

# Target: the log1p-transformed per-capita income
y = df_dum["y_log_ing_tri_pc"]

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ─── 2. Scaling (statistics learned from the training split only) ───────────────
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)   # the test split reuses the training statistics

# ─── 3. LassoCV with reproducible folds ─────────────────────────────────────────
# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Explicit grid: 100 alphas evenly spaced in log10 between 1e-4 and 1e1
alphas_lasso = np.logspace(-4, 1, 100)

lasso_cv = LassoCV(alphas=alphas_lasso, cv=kf, max_iter=10_000, random_state=42)
lasso_cv.fit(X_train_scaled, y_train)

print(f"Mejor alpha (λ) seleccionado por LassoCV: {lasso_cv.alpha_:.6f}")

# ─── 4. Selected features ───────────────────────────────────────────────────────
# Feature name -> fitted LASSO coefficient
coef_dict = dict(zip(X.columns, lasso_cv.coef_))

# Features whose coefficient was not shrunk to exactly zero, in column order
no_nulos = [(nombre, valor) for nombre, valor in coef_dict.items() if valor != 0]
selected_features = [nombre for nombre, _ in no_nulos]

print(f"\n{len(selected_features)} features seleccionadas (coef ≠ 0):")

# Report them from largest to smallest absolute coefficient
ranking = sorted(no_nulos, key=lambda par: abs(par[1]), reverse=True)
for i, (feat, coef) in enumerate(ranking, start=1):
    print(f"  {i:2d}. {feat:<30s}: {coef:+.6f}")

Mejor alpha (λ) seleccionado por LassoCV: 0.011768

44 features seleccionadas (coef ≠ 0):
   1. tot_resid                     : -0.187352
   2. estim_pago                    : +0.135645
   3. combus_4                      : +0.131302
   4. excusado_2                    : -0.114154
   5. eli_basura_4                  : -0.104893
   6. prop_muj                      : -0.101682
   7. bano_comp                     : +0.092576
   8. mat_pisos_3                   : +0.070092
   9. combus_3                      : +0.058284
  10. tenencia_2                    : -0.057514
  11. lugar_coc_5                   : -0.052648
  12. bano_excus                    : +0.043870
  13. ab_agua_7                     : -0.042996
  14. drenaje_2                     : -0.040827
  15. bano_regad                    : +0.040337
  16. tenencia_4                    : -0.040030
  17. eli_basura_5                  : -0.038023
  18. mat_pared_7                   : -0.037523
  19. excusado_3                    : -0.033850
  20. escrituras_4                  : -0.033687
  21. tenencia_5                    : -0.027513
  22. cocina                        : -0.025653
  23. tipo_viv_4                    : +0.024601
  24. dotac_agua_5                  : -0.023945
  25. lugar_coc_4                   : -0.023645
  26. eli_basura_2                  : -0.023043
  27. escrituras_5                  : +0.021170
  28. combus_7                      : -0.016348
  29. num_cuarto                    : +0.015657
  30. mat_pared_8                   : +0.013100
  31. ab_agua_6                     : -0.012841
  32. disp_elect_5                  : +0.011986
  33. drenaje_5                     : -0.011564
  34. dotac_agua_4                  : +0.011209
  35. cocina_dor                    : +0.010409
  36. mat_techos_6                  : +0.009847
  37. lugar_coc_6                   : -0.006247
  38. mat_techos_8                  : +0.004622
  39. mat_techos_4                  : -0.002900
  40. dotac_agua_2                  : -0.002550
  41. mat_techos_3                  : -0.002073
  42. ab_agua_5                     : -0.001869
  43. disp_elect_4                  : +0.001737
  44. eli_basura_6                  : -0.001155

import statsmodels.api as sm

# ─── 5. OLS regression fitted on the training split ─────────────────────────────

# Design matrix: the LASSO-selected columns as floats, plus an intercept column
X_train_sel = X_train[selected_features].astype(float)
X_train_sm = sm.add_constant(X_train_sel)

model = sm.OLS(endog=y_train, exog=X_train_sm)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:       y_log_ing_tri_pc   R-squared:                       0.394
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     43.94
Date:                Thu, 19 Feb 2026   Prob (F-statistic):          1.65e-285
Time:                        13:24:56   Log-Likelihood:                -3408.1
No. Observations:                3013   AIC:                             6906.
Df Residuals:                    2968   BIC:                             7177.
Df Model:                          44                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            9.8022      0.212     46.178      0.000       9.386      10.218
cocina          -0.1548      0.103     -1.507      0.132      -0.356       0.047
cocina_dor       0.1149      0.070      1.638      0.102      -0.023       0.252
num_cuarto       0.0154      0.012      1.239      0.216      -0.009       0.040
bano_comp        0.1411      0.030      4.659      0.000       0.082       0.201
bano_excus       0.1134      0.031      3.687      0.000       0.053       0.174
bano_regad       0.2389      0.063      3.777      0.000       0.115       0.363
estim_pago    3.528e-05   4.76e-06      7.410      0.000    2.59e-05    4.46e-05
tot_resid       -0.1173      0.008    -14.095      0.000      -0.134      -0.101
tipo_viv_4       0.4896      0.192      2.552      0.011       0.113       0.866
mat_pared_7     -0.1818      0.113     -1.610      0.108      -0.403       0.040
mat_pared_8      0.0093      0.104      0.089      0.929      -0.195       0.214
mat_techos_3     0.0044      0.044      0.099      0.921      -0.083       0.092
mat_techos_4    -0.2834      0.289     -0.979      0.328      -0.851       0.284
mat_techos_6     0.5353      0.313      1.710      0.087      -0.078       1.149
mat_techos_8     0.2765      0.171      1.622      0.105      -0.058       0.611
mat_pisos_3      0.1296      0.034      3.759      0.000       0.062       0.197
lugar_coc_4     -0.4425      0.195     -2.266      0.024      -0.826      -0.060
lugar_coc_5     -1.0124      0.245     -4.129      0.000      -1.493      -0.532
lugar_coc_6     -0.1478      0.280     -0.528      0.597      -0.696       0.401
ab_agua_5       -0.2721      0.316     -0.862      0.389      -0.891       0.347
ab_agua_6       -0.8077      0.543     -1.486      0.137      -1.873       0.258
ab_agua_7       -0.4420      0.123     -3.585      0.000      -0.684      -0.200
dotac_agua_2    -0.0425      0.063     -0.670      0.503      -0.167       0.082
dotac_agua_4     0.2350      0.137      1.712      0.087      -0.034       0.504
dotac_agua_5    -0.2025      0.092     -2.191      0.029      -0.384      -0.021
excusado_2      -0.3579      0.073     -4.897      0.000      -0.501      -0.215
excusado_3      -0.4401      0.180     -2.444      0.015      -0.793      -0.087
drenaje_2       -0.1021      0.044     -2.340      0.019      -0.188      -0.017
drenaje_5       -0.0619      0.085     -0.729      0.466      -0.228       0.105
disp_elect_4     0.1868      0.144      1.294      0.196      -0.096       0.470
disp_elect_5     0.5448      0.230      2.370      0.018       0.094       0.995
combus_3         0.2621      0.058      4.556      0.000       0.149       0.375
combus_4         0.4225      0.067      6.260      0.000       0.290       0.555
combus_7        -0.3058      0.284     -1.075      0.282      -0.864       0.252
eli_basura_2    -0.2084      0.087     -2.388      0.017      -0.380      -0.037
eli_basura_4    -0.2555      0.054     -4.742      0.000      -0.361      -0.150
eli_basura_5    -1.8565      0.538     -3.452      0.001      -2.911      -0.802
eli_basura_6    -0.2042      0.298     -0.686      0.493      -0.788       0.380
tenencia_2      -0.2986      0.060     -4.988      0.000      -0.416      -0.181
tenencia_4      -0.0887      0.044     -2.010      0.045      -0.175      -0.002
tenencia_5      -0.3543      0.102     -3.459      0.001      -0.555      -0.153
escrituras_4    -0.3649      0.121     -3.027      0.002      -0.601      -0.129
escrituras_5     0.1273      0.053      2.394      0.017       0.023       0.232
prop_muj        -0.4528      0.055     -8.216      0.000      -0.561      -0.345
==============================================================================
Omnibus:                     2540.652   Durbin-Watson:                   1.952
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           211281.019
Skew:                          -3.504   Prob(JB):                         0.00
Kurtosis:                      43.421   Cond. No.                     2.06e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.06e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline

# ─── Data: only the LASSO-selected features, as numpy arrays ──────────────────
X_tr_sel = X_train[selected_features].astype(float).values
X_te_sel = X_test[selected_features].astype(float).values

# ─── Pipeline: scaling → PolynomialFeatures(degree 2) → RidgeCV ───────────────
alphas_ridge = np.logspace(-2, 6, 100)   # wide grid for Ridge

pipe_poly = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2, include_bias=False),
    RidgeCV(alphas=alphas_ridge, cv=kf, scoring='r2'),
)
pipe_poly.fit(X_tr_sel, y_train)

# ─── Handles to the fitted pipeline steps ─────────────────────────────────────
pasos = pipe_poly.named_steps
scaler_step = pasos['standardscaler']
poly_step = pasos['polynomialfeatures']
ridge_step = pasos['ridgecv']
ridge_model = ridge_step
n_poly_feats = poly_step.n_output_features_

print(f"Features originales (LASSO):         {len(selected_features)}")
print(f"Features polinomiales (grado 2):     {n_poly_feats}")
print(f"Mejor alpha Ridge (validación cruzada): {ridge_model.alpha_:.4f}")

# ─── Names of the polynomial terms, derived from the original feature names ───
poly_feature_names = poly_step.get_feature_names_out(selected_features)

# ─── Ridge coefficients (one per polynomial term) ─────────────────────────────
coef_ridge = ridge_step.coef_

# Coefficient table ordered by decreasing absolute value; the helper column
# used for sorting is removed afterwards.
df_ridge_coef = pd.DataFrame({
    "feature": poly_feature_names,
    "coeficiente": coef_ridge,
})
df_ridge_coef["abs_coef"] = df_ridge_coef["coeficiente"].abs()
df_ridge_coef = df_ridge_coef.sort_values("abs_coef", ascending=False)
df_ridge_coef = df_ridge_coef.drop(columns="abs_coef").reset_index(drop=True)

print(f"Total de coeficientes: {len(df_ridge_coef)}")
print(f"\nTop 20 coeficientes (|β| más alto):")
print(df_ridge_coef.head(20).to_string(index=False))

print(f"\nBottom 10 coeficientes (|β| más bajo):")
print(df_ridge_coef.tail(10).to_string(index=False))

Features originales (LASSO):         44
Features polinomiales (grado 2):     1034
Mejor alpha Ridge (validación cruzada): 6579.3322
Total de coeficientes: 1034

Top 20 coeficientes (|β| más alto):
                  feature  coeficiente
                tot_resid    -0.054721
                 prop_muj    -0.030121
           eli_basura_4^2    -0.029046
             excusado_2^2    -0.028530
              mat_pisos_3     0.028248
dotac_agua_5 escrituras_4    -0.026969
               estim_pago     0.026161
 lugar_coc_5 eli_basura_2    -0.023649
                bano_comp     0.022883
       tot_resid prop_muj     0.022732
 lugar_coc_4 dotac_agua_5    -0.021642
               prop_muj^2     0.021546
                drenaje_2    -0.020234
        combus_3 combus_4    -0.020141
                 combus_4     0.019887
     lugar_coc_4 prop_muj     0.019578
             eli_basura_4    -0.017765
   ab_agua_7 dotac_agua_4    -0.017693
              drenaje_2^2    -0.017623
               combus_4^2     0.017356

Bottom 10 coeficientes (|β| más bajo):
                  feature   coeficiente
 lugar_coc_6 eli_basura_5  8.448771e-07
   ab_agua_6 dotac_agua_4 -8.182888e-07
    mat_techos_6 combus_7 -7.992915e-07
   mat_techos_6 ab_agua_5  7.549651e-07
     lugar_coc_5 combus_7  7.543087e-07
   ab_agua_5 dotac_agua_4 -4.560253e-07
 mat_techos_4 lugar_coc_6  3.998149e-07
 lugar_coc_5 disp_elect_4  2.397286e-07
   mat_techos_6 ab_agua_6  1.962324e-07
mat_techos_6 eli_basura_6 -1.168000e-08

# ─── How often the rare dummy categories occur in the training split ──────────
raras = ["mat_techos_6", "ab_agua_6", "lugar_coc_6"]

print(f"{'Variable':<20} {'Valor=1 (train)':<18} {'% del train'}")
print("─" * 52)
for var in raras:
    if var not in X_train.columns:
        print(f"{var:<20} — no encontrada en X_train")
        continue
    # The column sum is the number of training rows where the dummy is set.
    n = X_train[var].sum()
    pct = n / len(X_train) * 100
    print(f"{var:<20} {int(n):<18} {pct:.2f}%")

Variable             Valor=1 (train)    % del train
────────────────────────────────────────────────────
mat_techos_6         6                  0.20%
ab_agua_6            2                  0.07%
lugar_coc_6          14                 0.46%

# ─── 6. Full evaluation ───────────────────────────────────────────────────────
# Reference amount the RMSE is compared against in the report.
# NOTE(review): presumably a quarterly income poverty line in MXN — confirm the source.
LPI_TRIM = 13_920.48


def evaluar_modelo(nombre, y_true_log, y_pred_log):
    """Print and return regression metrics for predictions made in log1p scale.

    R² is computed on the log1p scale as given; RMSE, MAE and MAPE are computed
    after mapping both inputs back to the original scale with ``expm1``.

    Args:
        nombre: Label printed as the header of the report.
        y_true_log: Observed target values in log1p scale (array-like).
        y_pred_log: Predicted target values in log1p scale (array-like, same
            length and order as ``y_true_log``).

    Returns:
        dict with keys ``"R2"``, ``"RMSE"``, ``"MAE"`` and ``"MAPE"``. MAPE is
        a percentage computed only over observations whose true value is
        non-zero; it is NaN when every true value is zero.
    """
    # Work on plain arrays so the pairing is positional. With pandas inputs the
    # subtraction would align on index labels, and whether NaN is skipped in the
    # MAPE average would depend on the input type (Series.mean vs ndarray mean).
    y_true_log = np.asarray(y_true_log, dtype=float)
    y_pred_log = np.asarray(y_pred_log, dtype=float)

    y_true_orig = np.expm1(y_true_log)
    y_pred_orig = np.expm1(y_pred_log)

    r2   = r2_score(y_true_log, y_pred_log)
    rmse = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    mae  = mean_absolute_error(y_true_orig, y_pred_orig)

    # MAPE is undefined where the true value is 0, so those rows are excluded
    # explicitly instead of relying on NaN being skipped by the averaging call.
    no_cero = y_true_orig != 0
    if no_cero.any():
        error_rel = np.abs((y_true_orig[no_cero] - y_pred_orig[no_cero]) / y_true_orig[no_cero])
        mape = np.mean(error_rel) * 100
    else:
        mape = np.nan

    print(f"\n{'─'*50}")
    print(f"  {nombre}")
    print(f"{'─'*50}")
    print(f"  R² (log scale):             {r2:.4f}")
    print(f"  RMSE (MXN):  \t\t{rmse:>12,.2f}")
    print(f"  MAE  (MXN):  \t\t{mae:>12,.2f}")
    print(f"  MAPE (%):  \t\t{mape:>11.2f}%")
    print(f"  RMSE / LPI trimestral:      {rmse/LPI_TRIM*100:.1f}%")
    return {"R2": r2, "RMSE": rmse, "MAE": mae, "MAPE": mape}

# ─── Test-set prediction (OLS) ────────────────────────────────────────────────
X_test_sel = X_test[selected_features].astype(float)
# Same layout as the design matrix used for fitting. has_constant="add" forces
# the intercept column to be added even if some feature happens to be constant
# in the test split; the default ("skip") would leave it out in that case and
# the prediction would fail on a column mismatch.
X_test_sm  = sm.add_constant(X_test_sel, has_constant="add")

y_pred_ols   = results.predict(X_test_sm)          # OLS (selected features)

# `results` is the statsmodels OLS fit, so the report label names statsmodels.
m_ols   = evaluar_modelo("OLS statsmodels (LASSO features)", y_test, y_pred_ols)

──────────────────────────────────────────────────
  OLS sklearn (LASSO features)
──────────────────────────────────────────────────
  R² (log scale):             0.3212
  RMSE (MXN):  		   27,024.86
  MAE  (MXN):  		   12,065.71
  MAPE (%):  		      69.26%
  RMSE / LPI trimestral:      194.1%

# ─── Test-set prediction (degree-2 polynomial + Ridge pipeline) ───────────────
y_pred_poly = pipe_poly.predict(X_te_sel)

# Report and keep the metrics for the comparison table.
m_poly = evaluar_modelo("Regresión Polinomial grado 2 + Ridge", y_test, y_pred_poly)

──────────────────────────────────────────────────
  Regresión Polinomial grado 2 + Ridge
──────────────────────────────────────────────────
  R² (log scale):             0.2854
  RMSE (MXN):  		   27,964.91
  MAE  (MXN):  		   12,506.99
  MAPE (%):  		      70.00%
  RMSE / LPI trimestral:      200.9%

# ─── Comparison table ─────────────────────────────────────────────────────────
resultados = pd.DataFrame({
    "Modelo":    ["OLS — Lineal", "Polinomial g=2 + Ridge"],
    "R² (log)": [m_ols["R2"],   m_poly["R2"]],
    "RMSE (MXN)": [m_ols["RMSE"], m_poly["RMSE"]],
    "MAE (MXN)": [m_ols["MAE"],  m_poly["MAE"]],
    "MAPE (%)": [m_ols["MAPE"], m_poly["MAPE"]],
}).set_index("Modelo")

mejor = resultados["R² (log)"].idxmax()
print(f"✓ Mejor modelo por R²: {mejor}")
mejor_rmse = resultados["RMSE (MXN)"].idxmin()
print(f"✓ Mejor modelo por RMSE: {mejor_rmse}")

# ─── Plot: predictions vs. observed values (log1p scale) ──────────────────────
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

y_test_arr = np.array(y_test)

# Each panel carries the row key of `resultados` explicitly, so the R² shown in
# the title does not depend on matching a substring of the panel title.
paneles = [
    ("OLS — Lineal",           np.array(y_pred_ols),   "OLS — Regresión Lineal"),
    ("Polinomial g=2 + Ridge", np.asarray(y_pred_poly), "Polinomial grado 2 + Ridge"),
]

for ax, (clave, y_pred, titulo) in zip(axes, paneles):
    ax.scatter(y_test_arr, y_pred, alpha=0.25, s=10, color='steelblue', edgecolors='none')

    # Shared limits for both axes so the diagonal is the perfect-prediction line.
    lo = min(y_test_arr.min(), y_pred.min())
    hi = max(y_test_arr.max(), y_pred.max())
    ax.plot([lo, hi], [lo, hi], 'r--', lw=1.5, label='predicción perfecta')

    r2_val = resultados.loc[clave, "R² (log)"]
    ax.set_xlabel("log1p(ing_tri_pc) real", fontsize=11)
    ax.set_ylabel("log1p(ing_tri_pc) predicho", fontsize=11)
    ax.set_title(f"{titulo}\nR² = {r2_val:.4f}", fontsize=12)
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.legend(fontsize=9)

# Same finishing calls as the other plotting cells, so the figure is also
# rendered when the code runs outside a notebook.
plt.tight_layout()
plt.show()

✓ Mejor modelo por R²: OLS — Lineal
✓ Mejor modelo por RMSE: OLS — Lineal

Proyecto Final – Unidad 1

1) Planteamiento del problema y contexto de los datos

2) Exploración y comprensión del dataset

3) Preparación y tratamiento de datos

Problema A: valores faltantes

Problema B: variables categóricas codificadas como números

Problema C: distribución sesgada y outliers en ingreso

4) Selección de características

5) Construcción y comparación de modelos

A. Modelo lineal (para interpretación)

B. Modelo no lineal (para desempeño)

6) Evaluación del desempeño

7) Inferencia y conclusiones

8) Referencias