# Importar las librerías.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # Permite implementar la mayoría de modelos.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
import statsmodels.formula.api as smf

# Load the dataset (file 2024_2025_salud_mental.csv) directly from the GitHub repository.
url = (
    'https://raw.githubusercontent.com/estefaniadelarosa/IA-I/refs/heads/main/'
    'P1.%20Regresi%C3%B3n/P1.%20Regresi%C3%B3n/2024_2025_salud_mental.csv'
)
df = pd.read_csv(filepath_or_buffer=url)

# Report the (rows, columns) shape, then preview the first five rows.
print(df.shape)
df.head(n=5)

(48224, 14)

# Keep only the rows whose medical unit is in the municipality SAN PEDRO GARZA GARCIA.
df1 = df.loc[df['municipio_unidad_medica'].eq('SAN PEDRO GARZA GARCIA')]
print(df1.shape)
df1.head(n=5)

(1060, 14)

# Keep only people aged 18 to 65 (both bounds inclusive).
# .copy() makes df2 an independent DataFrame rather than a slice of df1, so the column
# assignments performed on df2 afterwards do not raise pandas' SettingWithCopyWarning.
df2 = df1[(df1['edad'] >= 18) & (df1['edad'] <= 65)].copy()
print(df2.shape)
df2.head()

(720, 14)

# Encode each categorical text column as integer codes, stored in a new "<column>_num" column.
from sklearn.preprocessing import LabelEncoder

# A fresh LabelEncoder is fitted for every column, so codes are independent between columns.
for _cat_col in (
    'sexo',
    'institucion_unidad_medica',
    'descripcion_grupo_enfermedad',
    'clave_enfermedad',
    'descripcion_enfermedad',
):
    df2[f'{_cat_col}_num'] = LabelEncoder().fit_transform(df2[_cat_col])

# Show five random rows to inspect the new columns.
df2.sample(n=5)

/var/folders/rw/krrrqrzn68j3jq_d4yl8mzk80000gn/T/ipykernel_10700/763801538.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['sexo_num'] = LabelEncoder().fit_transform(df2['sexo'])
/var/folders/rw/krrrqrzn68j3jq_d4yl8mzk80000gn/T/ipykernel_10700/763801538.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['institucion_unidad_medica_num'] = LabelEncoder().fit_transform(df2['institucion_unidad_medica'])
/var/folders/rw/krrrqrzn68j3jq_d4yl8mzk80000gn/T/ipykernel_10700/763801538.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['descripcion_grupo_enfermedad_num'] = LabelEncoder().fit_transform(df2['descripcion_grupo_enfermedad'])
/var/folders/rw/krrrqrzn68j3jq_d4yl8mzk80000gn/T/ipykernel_10700/763801538.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['clave_enfermedad_num'] = LabelEncoder().fit_transform(df2['clave_enfermedad'])
/var/folders/rw/krrrqrzn68j3jq_d4yl8mzk80000gn/T/ipykernel_10700/763801538.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['descripcion_enfermedad_num'] = LabelEncoder().fit_transform(df2['descripcion_enfermedad'])

# Remove the 'edad_meses' and 'edad_dias' columns.
df2 = df2.drop(['edad_meses', 'edad_dias'], axis='columns')

# Cast weight ('peso') and height ('altura') to float.
df2 = df2.astype({'peso': float, 'altura': float})

# Check the resulting dtypes.
print(df2[['peso', 'altura']].dtypes)

peso      float64
altura    float64
dtype: object

# Height in metres: 'altura' divided by 100 (i.e. it is treated as centimetres).
df2['altura_m'] = df2['altura'].div(100)
# Body-mass index (IMC): weight divided by the squared height in metres.
df2['IMC'] = df2['peso'].div(df2['altura_m'].pow(2))

# Model inputs (three predictor columns) and target (encoded disease group).
X = df2.loc[:, ['edad', 'sexo_num', 'institucion_unidad_medica_num']]
y = df2.loc[:, 'descripcion_grupo_enfermedad_num']
print(X.shape, y.shape, sep='\n')

(720, 3)
(720,)

# Number of records in each disease group (class balance of the target).
df2.loc[:, 'descripcion_grupo_enfermedad'].value_counts()

descripcion_grupo_enfermedad
TRASTORNOS MENTALES Y DEL COMPORTAMIENTO                                             547
FACTORES QUE INFLUYEN EN EL ESTADO DE SALUD Y CONTACTO CON LOS SERVICIOS DE SALUD    173
Name: count, dtype: int64

# Share of each disease group, as a percentage of all records.
_group_counts = df2['descripcion_grupo_enfermedad'].value_counts()
_group_counts.div(_group_counts.sum()).mul(100)

descripcion_grupo_enfermedad
TRASTORNOS MENTALES Y DEL COMPORTAMIENTO                                             75.972222
FACTORES QUE INFLUYEN EN EL ESTADO DE SALUD Y CONTACTO CON LOS SERVICIOS DE SALUD    24.027778
Name: count, dtype: float64

# Split the data into train (80 %) and test (20 %) sets, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((576, 3), (144, 3), (576,), (144,))

# Class counts in the training target.
# 0: factors influencing health status. 1: mental and behavioural disorders.
y_train.value_counts(sort=True, ascending=False)

descripcion_grupo_enfermedad_num
1    438
0    138
Name: count, dtype: int64

from imblearn.over_sampling import SMOTE

# Oversample the training split only, so both classes end up with the same number of rows.
# NOTE(review): 'sexo_num' and 'institucion_unidad_medica_num' are label-encoded categories;
# plain SMOTE interpolates them numerically -- consider imblearn's SMOTENC. Confirm intent.
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)

# Class counts after resampling.
# 0: factors influencing health status. 1: mental and behavioural disorders.
print(y_train_smote.value_counts())

descripcion_grupo_enfermedad_num
0    438
1    438
Name: count, dtype: int64

# Fit a label encoder on the target and keep the encoded target as an array.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
print(y_encoded.shape)

(720,)

# Pair plot of df2's numeric columns, coloured by the encoded disease group.
# One marker per class: the hue column has exactly two levels, and passing more markers
# than levels makes seaborn emit a UserWarning for every panel it draws.
ax = sns.pairplot(df2, hue = 'descripcion_grupo_enfermedad_num', markers = ["o", "s"])
plt.suptitle("Pair Plot: Grupos de Enfermedades.")
# Move the legend above the grid, laid out horizontally, without a title or frame.
sns.move_legend(
    ax, "lower center",
    bbox_to_anchor = (.5, 1), ncol = 2, title = None, frameon = False)
plt.tight_layout()
plt.show()

/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)
/opt/anaconda3/lib/python3.13/site-packages/seaborn/axisgrid.py:1615: UserWarning: The markers list has more values (3) than needed (2), which may not be intended.
  func(x=x, y=y, **kwargs)

# Histogram (with KDE) of every model feature, split by disease group.
# Iterate the column labels explicitly. Slicing with X[:-1] drops the last ROW of X, not a
# column, and only yields column names because iterating a DataFrame iterates its columns.
plt.figure(figsize = (12, 6))
for i, feature in enumerate(X.columns):
    plt.subplot(2, 2, i + 1)  # A 2x2 grid has room for the three features in X.
    sns.histplot(data = df2, x = feature, hue = 'descripcion_grupo_enfermedad_num', kde = True)
    plt.title(f'{feature} Distribution.')

plt.tight_layout()
plt.show()

# Heat map of the pairwise correlations between the numeric columns of df2.
correlation_matrix = df2.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.show()

# Check that the training target has two classes, which is why n_components = 1 is used for LDA.
_train_classes = np.unique(y_train)
print(_train_classes)
print(_train_classes.size)

[0 1]
2

# Fit a one-component LDA on the SMOTE-balanced training data, then project both splits.
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(X_train_smote, y_train_smote)
X_train_lda = lda.transform(X_train_smote)
X_test_lda = lda.transform(X_test)

# Table holding the single LDA projection of each resampled training row and its class.
tmp_Df = pd.DataFrame({
    'LDA Component 1': X_train_lda[:, 0],
    'Class': y_train_smote.values,
})

# Distribution of the projection for each class.
sns.histplot(data=tmp_Df, x='LDA Component 1', hue='Class', kde=True)
plt.show()

print(tmp_Df.head())

   LDA Component 1  Class
0        -0.571684      0
1        -1.299334      0
2         0.177260      1
3        -1.449144      1
4         1.296899      1

from sklearn.tree import DecisionTreeClassifier as DTC

# Unpruned baseline tree trained on the SMOTE-balanced training set.
# random_state is fixed (0, as for train_test_split and SMOTE) because scikit-learn permutes
# the features at every split: without a seed, ties between equally good splits can be
# broken differently on each run, so depth, leaf count and scores would not be reproducible.
tree_smote = DTC(random_state = 0).fit(X_train_smote, y_train_smote)

from sklearn.tree import plot_tree

# Draw the full (unpruned) tree on a large canvas.
plt.figure(figsize=(15,10))
# feature_names is passed as a plain list, which every scikit-learn release accepts
# (NOTE(review): some releases appear to validate it strictly as a list -- confirm).
# The returned list of matplotlib Text artists is bound to `_` so that it is not echoed
# as the cell's output; plt.show() renders the figure.
_ = plot_tree(tree_smote, filled = True, feature_names = list(X_train_smote.columns))
plt.show()

[Text(0.4680059523809524, 0.96875, 'institucion_unidad_medica_num <= 0.5\ngini = 0.5\nsamples = 876\nvalue = [438, 438]'),
 Text(0.2222222222222222, 0.90625, 'sexo_num <= 0.5\ngini = 0.38\nsamples = 381\nvalue = [97, 284]'),
 Text(0.3451140873015873, 0.9375, 'True  '),
 Text(0.09523809523809523, 0.84375, 'edad <= 21.5\ngini = 0.484\nsamples = 205\nvalue = [84, 121]'),
 Text(0.07936507936507936, 0.78125, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]'),
 Text(0.1111111111111111, 0.78125, 'edad <= 49.5\ngini = 0.494\nsamples = 188\nvalue = [84, 104]'),
 Text(0.047619047619047616, 0.71875, 'edad <= 24.5\ngini = 0.498\nsamples = 136\nvalue = [72, 64]'),
 Text(0.031746031746031744, 0.65625, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]'),
 Text(0.06349206349206349, 0.65625, 'edad <= 27.5\ngini = 0.5\nsamples = 129\nvalue = [65, 64]'),
 Text(0.031746031746031744, 0.59375, 'edad <= 25.5\ngini = 0.172\nsamples = 21\nvalue = [2, 19]'),
 Text(0.015873015873015872, 0.53125, 'gini = 0.408\nsamples = 7\nvalue = [2, 5]'),
 Text(0.047619047619047616, 0.53125, 'gini = 0.0\nsamples = 14\nvalue = [0, 14]'),
 Text(0.09523809523809523, 0.59375, 'edad <= 29.0\ngini = 0.486\nsamples = 108\nvalue = [63.0, 45.0]'),
 Text(0.07936507936507936, 0.53125, 'gini = 0.255\nsamples = 20\nvalue = [17, 3]'),
 Text(0.1111111111111111, 0.53125, 'edad <= 34.0\ngini = 0.499\nsamples = 88\nvalue = [46, 42]'),
 Text(0.07142857142857142, 0.46875, 'edad <= 32.5\ngini = 0.32\nsamples = 10\nvalue = [2, 8]'),
 Text(0.05555555555555555, 0.40625, 'edad <= 31.0\ngini = 0.5\nsamples = 4\nvalue = [2, 2]'),
 Text(0.03968253968253968, 0.34375, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'),
 Text(0.07142857142857142, 0.34375, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.0873015873015873, 0.40625, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'),
 Text(0.15079365079365079, 0.46875, 'edad <= 42.5\ngini = 0.492\nsamples = 78\nvalue = [44, 34]'),
 Text(0.11904761904761904, 0.40625, 'edad <= 41.5\ngini = 0.401\nsamples = 36\nvalue = [26, 10]'),
 Text(0.10317460317460317, 0.34375, 'edad <= 36.0\ngini = 0.473\nsamples = 26\nvalue = [16, 10]'),
 Text(0.0873015873015873, 0.28125, 'gini = 0.444\nsamples = 18\nvalue = [12, 6]'),
 Text(0.11904761904761904, 0.28125, 'edad <= 38.5\ngini = 0.5\nsamples = 8\nvalue = [4, 4]'),
 Text(0.10317460317460317, 0.21875, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'),
 Text(0.1349206349206349, 0.21875, 'edad <= 40.5\ngini = 0.444\nsamples = 6\nvalue = [4, 2]'),
 Text(0.11904761904761904, 0.15625, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.15079365079365079, 0.15625, 'gini = 0.5\nsamples = 4\nvalue = [2, 2]'),
 Text(0.1349206349206349, 0.34375, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]'),
 Text(0.18253968253968253, 0.40625, 'edad <= 45.0\ngini = 0.49\nsamples = 42\nvalue = [18, 24]'),
 Text(0.16666666666666666, 0.34375, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]'),
 Text(0.1984126984126984, 0.34375, 'edad <= 48.5\ngini = 0.48\nsamples = 30\nvalue = [18, 12]'),
 Text(0.18253968253968253, 0.28125, 'edad <= 46.5\ngini = 0.494\nsamples = 27\nvalue = [15, 12]'),
 Text(0.16666666666666666, 0.21875, 'gini = 0.375\nsamples = 4\nvalue = [3, 1]'),
 Text(0.1984126984126984, 0.21875, 'edad <= 47.5\ngini = 0.499\nsamples = 23\nvalue = [12, 11]'),
 Text(0.18253968253968253, 0.15625, 'gini = 0.498\nsamples = 17\nvalue = [8, 9]'),
 Text(0.21428571428571427, 0.15625, 'gini = 0.444\nsamples = 6\nvalue = [4, 2]'),
 Text(0.21428571428571427, 0.28125, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.1746031746031746, 0.71875, 'edad <= 53.5\ngini = 0.355\nsamples = 52\nvalue = [12, 40]'),
 Text(0.14285714285714285, 0.65625, 'edad <= 50.5\ngini = 0.083\nsamples = 23\nvalue = [1, 22]'),
 Text(0.12698412698412698, 0.59375, 'gini = 0.219\nsamples = 8\nvalue = [1, 7]'),
 Text(0.15873015873015872, 0.59375, 'gini = 0.0\nsamples = 15\nvalue = [0, 15]'),
 Text(0.20634920634920634, 0.65625, 'edad <= 54.5\ngini = 0.471\nsamples = 29\nvalue = [11, 18]'),
 Text(0.19047619047619047, 0.59375, 'gini = 0.298\nsamples = 11\nvalue = [9, 2]'),
 Text(0.2222222222222222, 0.59375, 'edad <= 58.5\ngini = 0.198\nsamples = 18\nvalue = [2, 16]'),
 Text(0.20634920634920634, 0.53125, 'gini = 0.0\nsamples = 13\nvalue = [0, 13]'),
 Text(0.23809523809523808, 0.53125, 'edad <= 60.5\ngini = 0.48\nsamples = 5\nvalue = [2, 3]'),
 Text(0.2222222222222222, 0.46875, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.25396825396825395, 0.46875, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'),
 Text(0.3492063492063492, 0.84375, 'edad <= 46.0\ngini = 0.137\nsamples = 176\nvalue = [13, 163]'),
 Text(0.31746031746031744, 0.78125, 'edad <= 32.5\ngini = 0.058\nsamples = 133\nvalue = [4, 129]'),
 Text(0.30158730158730157, 0.71875, 'edad <= 21.5\ngini = 0.081\nsamples = 95\nvalue = [4, 91]'),
 Text(0.2857142857142857, 0.65625, 'gini = 0.0\nsamples = 26\nvalue = [0, 26]'),
 Text(0.31746031746031744, 0.65625, 'edad <= 23.0\ngini = 0.109\nsamples = 69\nvalue = [4, 65]'),
 Text(0.30158730158730157, 0.59375, 'gini = 0.5\nsamples = 2\nvalue = [1, 1]'),
 Text(0.3333333333333333, 0.59375, 'edad <= 31.5\ngini = 0.086\nsamples = 67\nvalue = [3, 64]'),
 Text(0.31746031746031744, 0.53125, 'edad <= 25.5\ngini = 0.065\nsamples = 59\nvalue = [2, 57]'),
 Text(0.2857142857142857, 0.46875, 'edad <= 24.5\ngini = 0.198\nsamples = 9\nvalue = [1, 8]'),
 Text(0.2698412698412698, 0.40625, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'),
 Text(0.30158730158730157, 0.40625, 'gini = 0.278\nsamples = 6\nvalue = [1, 5]'),
 Text(0.3492063492063492, 0.46875, 'edad <= 28.5\ngini = 0.039\nsamples = 50\nvalue = [1, 49]'),
 Text(0.3333333333333333, 0.40625, 'edad <= 27.5\ngini = 0.071\nsamples = 27\nvalue = [1, 26]'),
 Text(0.31746031746031744, 0.34375, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]'),
 Text(0.3492063492063492, 0.34375, 'gini = 0.095\nsamples = 20\nvalue = [1, 19]'),
 Text(0.36507936507936506, 0.40625, 'gini = 0.0\nsamples = 23\nvalue = [0, 23]'),
 Text(0.3492063492063492, 0.53125, 'gini = 0.219\nsamples = 8\nvalue = [1, 7]'),
 Text(0.3333333333333333, 0.71875, 'gini = 0.0\nsamples = 38\nvalue = [0, 38]'),
 Text(0.38095238095238093, 0.78125, 'edad <= 48.0\ngini = 0.331\nsamples = 43\nvalue = [9, 34]'),
 Text(0.36507936507936506, 0.71875, 'gini = 0.459\nsamples = 14\nvalue = [9, 5]'),
 Text(0.3968253968253968, 0.71875, 'gini = 0.0\nsamples = 29\nvalue = [0, 29]'),
 Text(0.7137896825396826, 0.90625, 'edad <= 42.5\ngini = 0.429\nsamples = 495\nvalue = [341, 154]'),
 Text(0.5908978174603174, 0.9375, '  False'),
 Text(0.5466269841269841, 0.84375, 'sexo_num <= 0.5\ngini = 0.493\nsamples = 232\nvalue = [130, 102]'),
 Text(0.4742063492063492, 0.78125, 'edad <= 19.5\ngini = 0.468\nsamples = 203\nvalue = [127, 76]'),
 Text(0.42857142857142855, 0.71875, 'edad <= 18.5\ngini = 0.124\nsamples = 15\nvalue = [14, 1]'),
 Text(0.4126984126984127, 0.65625, 'gini = 0.18\nsamples = 10\nvalue = [9, 1]'),
 Text(0.4444444444444444, 0.65625, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'),
 Text(0.5198412698412699, 0.71875, 'edad <= 21.5\ngini = 0.48\nsamples = 188\nvalue = [113, 75]'),
 Text(0.47619047619047616, 0.65625, 'edad <= 20.5\ngini = 0.188\nsamples = 19\nvalue = [2, 17]'),
 Text(0.4603174603174603, 0.59375, 'gini = 0.245\nsamples = 7\nvalue = [1, 6]'),
 Text(0.49206349206349204, 0.59375, 'gini = 0.153\nsamples = 12\nvalue = [1, 11]'),
 Text(0.5634920634920635, 0.65625, 'edad <= 23.5\ngini = 0.451\nsamples = 169\nvalue = [111.0, 58.0]'),
 Text(0.5238095238095238, 0.59375, 'edad <= 22.5\ngini = 0.26\nsamples = 39\nvalue = [33, 6]'),
 Text(0.5079365079365079, 0.53125, 'gini = 0.238\nsamples = 29\nvalue = [25, 4]'),
 Text(0.5396825396825397, 0.53125, 'gini = 0.32\nsamples = 10\nvalue = [8, 2]'),
 Text(0.6031746031746031, 0.59375, 'edad <= 25.5\ngini = 0.48\nsamples = 130\nvalue = [78, 52]'),
 Text(0.5714285714285714, 0.53125, 'edad <= 24.5\ngini = 0.473\nsamples = 26\nvalue = [10, 16]'),
 Text(0.5555555555555556, 0.46875, 'gini = 0.486\nsamples = 12\nvalue = [5, 7]'),
 Text(0.5873015873015873, 0.46875, 'gini = 0.459\nsamples = 14\nvalue = [5, 9]'),
 Text(0.6349206349206349, 0.53125, 'edad <= 28.5\ngini = 0.453\nsamples = 104\nvalue = [68, 36]'),
 Text(0.6190476190476191, 0.46875, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]'),
 Text(0.6507936507936508, 0.46875, 'edad <= 41.5\ngini = 0.476\nsamples = 92\nvalue = [56, 36]'),
 Text(0.6349206349206349, 0.40625, 'edad <= 40.5\ngini = 0.472\nsamples = 89\nvalue = [55.0, 34.0]'),
 Text(0.6190476190476191, 0.34375, 'edad <= 36.5\ngini = 0.479\nsamples = 83\nvalue = [50.0, 33.0]'),
 Text(0.5793650793650794, 0.28125, 'edad <= 31.5\ngini = 0.46\nsamples = 64\nvalue = [41, 23]'),
 Text(0.5476190476190477, 0.21875, 'edad <= 29.5\ngini = 0.48\nsamples = 5\nvalue = [2, 3]'),
 Text(0.5317460317460317, 0.15625, 'gini = 0.444\nsamples = 3\nvalue = [2, 1]'),
 Text(0.5634920634920635, 0.15625, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'),
 Text(0.6111111111111112, 0.21875, 'edad <= 32.5\ngini = 0.448\nsamples = 59\nvalue = [39, 20]'),
 Text(0.5952380952380952, 0.15625, 'gini = 0.32\nsamples = 5\nvalue = [4, 1]'),
 Text(0.626984126984127, 0.15625, 'edad <= 34.5\ngini = 0.456\nsamples = 54\nvalue = [35, 19]'),
 Text(0.5952380952380952, 0.09375, 'edad <= 33.5\ngini = 0.471\nsamples = 29\nvalue = [18, 11]'),
 Text(0.5793650793650794, 0.03125, 'gini = 0.459\nsamples = 28\nvalue = [18, 10]'),
 Text(0.6111111111111112, 0.03125, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'),
 Text(0.6587301587301587, 0.09375, 'edad <= 35.5\ngini = 0.435\nsamples = 25\nvalue = [17, 8]'),
 Text(0.6428571428571429, 0.03125, 'gini = 0.278\nsamples = 6\nvalue = [5, 1]'),
 Text(0.6746031746031746, 0.03125, 'gini = 0.465\nsamples = 19\nvalue = [12, 7]'),
 Text(0.6587301587301587, 0.28125, 'edad <= 38.5\ngini = 0.499\nsamples = 19\nvalue = [9, 10]'),
 Text(0.6428571428571429, 0.21875, 'gini = 0.463\nsamples = 11\nvalue = [4, 7]'),
 Text(0.6746031746031746, 0.21875, 'gini = 0.469\nsamples = 8\nvalue = [5, 3]'),
 Text(0.6507936507936508, 0.34375, 'gini = 0.278\nsamples = 6\nvalue = [5, 1]'),
 Text(0.6666666666666666, 0.40625, 'gini = 0.444\nsamples = 3\nvalue = [1, 2]'),
 Text(0.6190476190476191, 0.78125, 'edad <= 19.0\ngini = 0.185\nsamples = 29\nvalue = [3, 26]'),
 Text(0.6031746031746031, 0.71875, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.6349206349206349, 0.71875, 'edad <= 32.0\ngini = 0.071\nsamples = 27\nvalue = [1, 26]'),
 Text(0.6190476190476191, 0.65625, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]'),
 Text(0.6507936507936508, 0.65625, 'edad <= 34.0\ngini = 0.18\nsamples = 10\nvalue = [1, 9]'),
 Text(0.6349206349206349, 0.59375, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'),
 Text(0.6666666666666666, 0.59375, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]'),
 Text(0.8809523809523809, 0.84375, 'edad <= 56.5\ngini = 0.317\nsamples = 263\nvalue = [211, 52]'),
 Text(0.8253968253968254, 0.78125, 'edad <= 52.5\ngini = 0.249\nsamples = 206\nvalue = [176.0, 30.0]'),
 Text(0.7936507936507936, 0.71875, 'sexo_num <= 0.5\ngini = 0.363\nsamples = 88\nvalue = [67, 21]'),
 Text(0.7777777777777778, 0.65625, 'edad <= 48.5\ngini = 0.458\nsamples = 59\nvalue = [38, 21]'),
 Text(0.746031746031746, 0.59375, 'edad <= 47.0\ngini = 0.397\nsamples = 44\nvalue = [32, 12]'),
 Text(0.7301587301587301, 0.53125, 'edad <= 45.5\ngini = 0.495\nsamples = 20\nvalue = [11, 9]'),
 Text(0.7142857142857143, 0.46875, 'edad <= 43.5\ngini = 0.444\nsamples = 15\nvalue = [10, 5]'),
 Text(0.6984126984126984, 0.40625, 'gini = 0.375\nsamples = 4\nvalue = [1, 3]'),
 Text(0.7301587301587301, 0.40625, 'edad <= 44.5\ngini = 0.298\nsamples = 11\nvalue = [9, 2]'),
 Text(0.7142857142857143, 0.34375, 'gini = 0.444\nsamples = 6\nvalue = [4, 2]'),
 Text(0.746031746031746, 0.34375, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'),
 Text(0.746031746031746, 0.46875, 'gini = 0.32\nsamples = 5\nvalue = [1, 4]'),
 Text(0.7619047619047619, 0.53125, 'gini = 0.219\nsamples = 24\nvalue = [21, 3]'),
 Text(0.8095238095238095, 0.59375, 'edad <= 51.0\ngini = 0.48\nsamples = 15\nvalue = [6, 9]'),
 Text(0.7936507936507936, 0.53125, 'edad <= 49.5\ngini = 0.219\nsamples = 8\nvalue = [1, 7]'),
 Text(0.7777777777777778, 0.46875, 'gini = 0.278\nsamples = 6\nvalue = [1, 5]'),
 Text(0.8095238095238095, 0.46875, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'),
 Text(0.8253968253968254, 0.53125, 'gini = 0.408\nsamples = 7\nvalue = [5, 2]'),
 Text(0.8095238095238095, 0.65625, 'gini = 0.0\nsamples = 29\nvalue = [29, 0]'),
 Text(0.8571428571428571, 0.71875, 'edad <= 53.5\ngini = 0.141\nsamples = 118\nvalue = [109, 9]'),
 Text(0.8412698412698413, 0.65625, 'gini = 0.0\nsamples = 22\nvalue = [22, 0]'),
 Text(0.873015873015873, 0.65625, 'edad <= 54.5\ngini = 0.17\nsamples = 96\nvalue = [87, 9]'),
 Text(0.8571428571428571, 0.59375, 'gini = 0.208\nsamples = 34\nvalue = [30, 4]'),
 Text(0.8888888888888888, 0.59375, 'edad <= 55.5\ngini = 0.148\nsamples = 62\nvalue = [57, 5]'),
 Text(0.873015873015873, 0.53125, 'gini = 0.08\nsamples = 24\nvalue = [23, 1]'),
 Text(0.9047619047619048, 0.53125, 'gini = 0.188\nsamples = 38\nvalue = [34, 4]'),
 Text(0.9365079365079365, 0.78125, 'edad <= 58.0\ngini = 0.474\nsamples = 57\nvalue = [35, 22]'),
 Text(0.9206349206349206, 0.71875, 'gini = 0.332\nsamples = 19\nvalue = [4, 15]'),
 Text(0.9523809523809523, 0.71875, 'edad <= 61.5\ngini = 0.301\nsamples = 38\nvalue = [31, 7]'),
 Text(0.9365079365079365, 0.65625, 'gini = 0.0\nsamples = 23\nvalue = [23, 0]'),
 Text(0.9682539682539683, 0.65625, 'edad <= 64.5\ngini = 0.498\nsamples = 15\nvalue = [8, 7]'),
 Text(0.9523809523809523, 0.59375, 'edad <= 63.5\ngini = 0.497\nsamples = 13\nvalue = [6, 7]'),
 Text(0.9365079365079365, 0.53125, 'edad <= 62.5\ngini = 0.494\nsamples = 9\nvalue = [5, 4]'),
 Text(0.9206349206349206, 0.46875, 'gini = 0.49\nsamples = 7\nvalue = [3, 4]'),
 Text(0.9523809523809523, 0.46875, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.9682539682539683, 0.53125, 'gini = 0.375\nsamples = 4\nvalue = [1, 3]'),
 Text(0.9841269841269841, 0.59375, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]')]

# Depth of the unpruned tree.
print(tree_smote.get_depth())

15

# Number of leaves of the unpruned tree.
print(tree_smote.tree_.n_leaves)

78

# Score the initial tree on the test data: plain accuracy plus the
# class-frequency-weighted F1.
from sklearn.metrics import accuracy_score, f1_score

yhat0 = tree_smote.predict(X_test)
acc0, f10 = (
    accuracy_score(y_true=y_test, y_pred=yhat0),
    f1_score(y_true=y_test, y_pred=yhat0, average='weighted'),
)
print("Accuracy inicial: ", acc0)
print("F1-score inicial: ", f10)

Accuracy inicial:  0.8125
F1-score inicial:  0.8179125376992675

# Confusion matrix of the initial tree's test predictions, drawn as an
# annotated heatmap (integer counts, no colour bar, square cells).
from sklearn.metrics import confusion_matrix

conf_m = confusion_matrix(y_true=y_test, y_pred=yhat0)
heatmap_ax = sns.heatmap(
    conf_m,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    square=True,
)
# Label the axes returned by seaborn directly instead of going through pyplot.
heatmap_ax.set_ylabel('y_true')
heatmap_ax.set_xlabel('y_pred')
heatmap_ax.set_title('Matriz de Confusión - Árbol Inicial.')
plt.show()

from sklearn.model_selection import cross_val_score, StratifiedKFold

# Choose the cost-complexity pruning strength (ccp_alpha) by 4-fold stratified
# cross-validation over 250 evenly spaced candidates in [0.001, 0.2], scored
# with the binary F1 metric.
#
# The folds are shuffled with a fixed seed so that every fold mixes all rows of
# the resampled training set and the selected alpha is reproducible.
# NOTE(review): presumably the SMOTE output keeps the synthetic rows appended
# after the original ones; without shuffling, contiguous folds would then
# separate real and synthetic minority samples — confirm against the
# resampling step.
# NOTE(review): cross-validating on data that was oversampled beforehand can
# leak information between folds; resampling inside each fold would be
# stricter.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
ccp = np.linspace(0.001, 0.2, 250)

# Score exactly the estimator configuration that is fitted afterwards with the
# chosen alpha (DTC(ccp_alpha=alpha)), so the tuned model and the final model
# share the same hyperparameters.
cv_scores = [
    np.mean(
        cross_val_score(
            DTC(ccp_alpha=candidate_alpha),
            X_train_smote,
            y_train_smote,
            cv=skf,
            scoring='f1',
        )
    )
    for candidate_alpha in ccp
]

# np.argmax returns the first maximum, i.e. the smallest alpha among ties.
alpha = ccp[np.argmax(cv_scores)]
print("Best alpha: ", alpha)

Best alpha:  0.001

# Fit a tree with the selected pruning strength on the SMOTE training set
# and draw it, labelling the splits with the training column names.
pruned_tree = DTC(ccp_alpha=alpha)
pruned_tree.fit(X_train_smote, y_train_smote)
plot_tree(pruned_tree, feature_names=X_train_smote.columns, filled=True)

[Text(0.4253472222222222, 0.9583333333333334, 'institucion_unidad_medica_num <= 0.5\ngini = 0.5\nsamples = 876\nvalue = [438, 438]'),
 Text(0.19791666666666666, 0.875, 'sexo_num <= 0.5\ngini = 0.38\nsamples = 381\nvalue = [97, 284]'),
 Text(0.3116319444444444, 0.9166666666666667, 'True  '),
 Text(0.11805555555555555, 0.7916666666666666, 'edad <= 21.5\ngini = 0.484\nsamples = 205\nvalue = [84, 121]'),
 Text(0.09027777777777778, 0.7083333333333334, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]'),
 Text(0.14583333333333334, 0.7083333333333334, 'edad <= 49.5\ngini = 0.494\nsamples = 188\nvalue = [84, 104]'),
 Text(0.06944444444444445, 0.625, 'edad <= 24.5\ngini = 0.498\nsamples = 136\nvalue = [72, 64]'),
 Text(0.041666666666666664, 0.5416666666666666, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]'),
 Text(0.09722222222222222, 0.5416666666666666, 'edad <= 27.5\ngini = 0.5\nsamples = 129\nvalue = [65, 64]'),
 Text(0.06944444444444445, 0.4583333333333333, 'gini = 0.172\nsamples = 21\nvalue = [2, 19]'),
 Text(0.125, 0.4583333333333333, 'edad <= 29.0\ngini = 0.486\nsamples = 108\nvalue = [63.0, 45.0]'),
 Text(0.09722222222222222, 0.375, 'gini = 0.255\nsamples = 20\nvalue = [17, 3]'),
 Text(0.1527777777777778, 0.375, 'edad <= 34.0\ngini = 0.499\nsamples = 88\nvalue = [46, 42]'),
 Text(0.08333333333333333, 0.2916666666666667, 'edad <= 32.5\ngini = 0.32\nsamples = 10\nvalue = [2, 8]'),
 Text(0.05555555555555555, 0.20833333333333334, 'edad <= 31.0\ngini = 0.5\nsamples = 4\nvalue = [2, 2]'),
 Text(0.027777777777777776, 0.125, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'),
 Text(0.08333333333333333, 0.125, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.1111111111111111, 0.20833333333333334, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'),
 Text(0.2222222222222222, 0.2916666666666667, 'edad <= 42.5\ngini = 0.492\nsamples = 78\nvalue = [44, 34]'),
 Text(0.16666666666666666, 0.20833333333333334, 'edad <= 41.5\ngini = 0.401\nsamples = 36\nvalue = [26, 10]'),
 Text(0.1388888888888889, 0.125, 'gini = 0.473\nsamples = 26\nvalue = [16, 10]'),
 Text(0.19444444444444445, 0.125, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]'),
 Text(0.2777777777777778, 0.20833333333333334, 'edad <= 45.0\ngini = 0.49\nsamples = 42\nvalue = [18, 24]'),
 Text(0.25, 0.125, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]'),
 Text(0.3055555555555556, 0.125, 'edad <= 48.5\ngini = 0.48\nsamples = 30\nvalue = [18, 12]'),
 Text(0.2777777777777778, 0.041666666666666664, 'gini = 0.494\nsamples = 27\nvalue = [15, 12]'),
 Text(0.3333333333333333, 0.041666666666666664, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.2222222222222222, 0.625, 'edad <= 53.5\ngini = 0.355\nsamples = 52\nvalue = [12, 40]'),
 Text(0.19444444444444445, 0.5416666666666666, 'gini = 0.083\nsamples = 23\nvalue = [1, 22]'),
 Text(0.25, 0.5416666666666666, 'edad <= 54.5\ngini = 0.471\nsamples = 29\nvalue = [11, 18]'),
 Text(0.2222222222222222, 0.4583333333333333, 'gini = 0.298\nsamples = 11\nvalue = [9, 2]'),
 Text(0.2777777777777778, 0.4583333333333333, 'edad <= 58.5\ngini = 0.198\nsamples = 18\nvalue = [2, 16]'),
 Text(0.25, 0.375, 'gini = 0.0\nsamples = 13\nvalue = [0, 13]'),
 Text(0.3055555555555556, 0.375, 'edad <= 60.5\ngini = 0.48\nsamples = 5\nvalue = [2, 3]'),
 Text(0.2777777777777778, 0.2916666666666667, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.3333333333333333, 0.2916666666666667, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'),
 Text(0.2777777777777778, 0.7916666666666666, 'edad <= 46.0\ngini = 0.137\nsamples = 176\nvalue = [13, 163]'),
 Text(0.25, 0.7083333333333334, 'gini = 0.058\nsamples = 133\nvalue = [4, 129]'),
 Text(0.3055555555555556, 0.7083333333333334, 'edad <= 48.0\ngini = 0.331\nsamples = 43\nvalue = [9, 34]'),
 Text(0.2777777777777778, 0.625, 'gini = 0.459\nsamples = 14\nvalue = [9, 5]'),
 Text(0.3333333333333333, 0.625, 'gini = 0.0\nsamples = 29\nvalue = [0, 29]'),
 Text(0.6527777777777778, 0.875, 'edad <= 42.5\ngini = 0.429\nsamples = 495\nvalue = [341, 154]'),
 Text(0.5390625, 0.9166666666666667, '  False'),
 Text(0.4722222222222222, 0.7916666666666666, 'sexo_num <= 0.5\ngini = 0.493\nsamples = 232\nvalue = [130, 102]'),
 Text(0.4166666666666667, 0.7083333333333334, 'edad <= 19.5\ngini = 0.468\nsamples = 203\nvalue = [127, 76]'),
 Text(0.3888888888888889, 0.625, 'gini = 0.124\nsamples = 15\nvalue = [14, 1]'),
 Text(0.4444444444444444, 0.625, 'edad <= 21.5\ngini = 0.48\nsamples = 188\nvalue = [113, 75]'),
 Text(0.4166666666666667, 0.5416666666666666, 'gini = 0.188\nsamples = 19\nvalue = [2, 17]'),
 Text(0.4722222222222222, 0.5416666666666666, 'edad <= 23.5\ngini = 0.451\nsamples = 169\nvalue = [111.0, 58.0]'),
 Text(0.4444444444444444, 0.4583333333333333, 'gini = 0.26\nsamples = 39\nvalue = [33, 6]'),
 Text(0.5, 0.4583333333333333, 'edad <= 25.5\ngini = 0.48\nsamples = 130\nvalue = [78, 52]'),
 Text(0.4722222222222222, 0.375, 'gini = 0.473\nsamples = 26\nvalue = [10, 16]'),
 Text(0.5277777777777778, 0.375, 'edad <= 28.5\ngini = 0.453\nsamples = 104\nvalue = [68, 36]'),
 Text(0.5, 0.2916666666666667, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]'),
 Text(0.5555555555555556, 0.2916666666666667, 'gini = 0.476\nsamples = 92\nvalue = [56, 36]'),
 Text(0.5277777777777778, 0.7083333333333334, 'edad <= 19.0\ngini = 0.185\nsamples = 29\nvalue = [3, 26]'),
 Text(0.5, 0.625, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(0.5555555555555556, 0.625, 'edad <= 32.0\ngini = 0.071\nsamples = 27\nvalue = [1, 26]'),
 Text(0.5277777777777778, 0.5416666666666666, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]'),
 Text(0.5833333333333334, 0.5416666666666666, 'edad <= 34.0\ngini = 0.18\nsamples = 10\nvalue = [1, 9]'),
 Text(0.5555555555555556, 0.4583333333333333, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'),
 Text(0.6111111111111112, 0.4583333333333333, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]'),
 Text(0.8333333333333334, 0.7916666666666666, 'edad <= 56.5\ngini = 0.317\nsamples = 263\nvalue = [211, 52]'),
 Text(0.7777777777777778, 0.7083333333333334, 'edad <= 52.5\ngini = 0.249\nsamples = 206\nvalue = [176.0, 30.0]'),
 Text(0.75, 0.625, 'sexo_num <= 0.5\ngini = 0.363\nsamples = 88\nvalue = [67, 21]'),
 Text(0.7222222222222222, 0.5416666666666666, 'edad <= 48.5\ngini = 0.458\nsamples = 59\nvalue = [38, 21]'),
 Text(0.6666666666666666, 0.4583333333333333, 'edad <= 47.0\ngini = 0.397\nsamples = 44\nvalue = [32, 12]'),
 Text(0.6388888888888888, 0.375, 'edad <= 45.5\ngini = 0.495\nsamples = 20\nvalue = [11, 9]'),
 Text(0.6111111111111112, 0.2916666666666667, 'edad <= 43.5\ngini = 0.444\nsamples = 15\nvalue = [10, 5]'),
 Text(0.5833333333333334, 0.20833333333333334, 'gini = 0.375\nsamples = 4\nvalue = [1, 3]'),
 Text(0.6388888888888888, 0.20833333333333334, 'gini = 0.298\nsamples = 11\nvalue = [9, 2]'),
 Text(0.6666666666666666, 0.2916666666666667, 'gini = 0.32\nsamples = 5\nvalue = [1, 4]'),
 Text(0.6944444444444444, 0.375, 'gini = 0.219\nsamples = 24\nvalue = [21, 3]'),
 Text(0.7777777777777778, 0.4583333333333333, 'edad <= 51.0\ngini = 0.48\nsamples = 15\nvalue = [6, 9]'),
 Text(0.75, 0.375, 'gini = 0.219\nsamples = 8\nvalue = [1, 7]'),
 Text(0.8055555555555556, 0.375, 'gini = 0.408\nsamples = 7\nvalue = [5, 2]'),
 Text(0.7777777777777778, 0.5416666666666666, 'gini = 0.0\nsamples = 29\nvalue = [29, 0]'),
 Text(0.8055555555555556, 0.625, 'gini = 0.141\nsamples = 118\nvalue = [109, 9]'),
 Text(0.8888888888888888, 0.7083333333333334, 'edad <= 58.0\ngini = 0.474\nsamples = 57\nvalue = [35, 22]'),
 Text(0.8611111111111112, 0.625, 'gini = 0.332\nsamples = 19\nvalue = [4, 15]'),
 Text(0.9166666666666666, 0.625, 'edad <= 61.5\ngini = 0.301\nsamples = 38\nvalue = [31, 7]'),
 Text(0.8888888888888888, 0.5416666666666666, 'gini = 0.0\nsamples = 23\nvalue = [23, 0]'),
 Text(0.9444444444444444, 0.5416666666666666, 'edad <= 64.5\ngini = 0.498\nsamples = 15\nvalue = [8, 7]'),
 Text(0.9166666666666666, 0.4583333333333333, 'gini = 0.497\nsamples = 13\nvalue = [6, 7]'),
 Text(0.9722222222222222, 0.4583333333333333, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]')]

# Report the depth of the pruned tree.
pruned_tree_depth = pruned_tree.tree_.max_depth
print(pruned_tree_depth)

11

# Report how many leaves remain in the pruned tree.
pruned_tree_leaves = pruned_tree.get_n_leaves()
print(pruned_tree_leaves)

42

# Score the pruned tree on the test data with the same two metrics used for
# the initial tree: accuracy and weighted F1.
yhat_p = pruned_tree.predict(X_test)
acc_p, f1_p = (
    accuracy_score(y_true=y_test, y_pred=yhat_p),
    f1_score(y_true=y_test, y_pred=yhat_p, average='weighted'),
)
print("Accuracy final: ", acc_p)
print("F1-score final: ", f1_p)

Accuracy final:  0.7916666666666666
F1-score final:  0.8018790849673203

# Confusion matrix of the pruned tree's test predictions (yhat_p), drawn as
# an annotated heatmap (integer counts, no colour bar, square cells).
from sklearn.metrics import confusion_matrix

conf_m = confusion_matrix(y_true=y_test, y_pred=yhat_p)
pruned_heatmap_ax = sns.heatmap(
    conf_m,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    square=True,
)
# Label the axes returned by seaborn directly instead of going through pyplot.
pruned_heatmap_ax.set_ylabel('y_true')
pruned_heatmap_ax.set_xlabel('y_pred')
pruned_heatmap_ax.set_title('Matriz de Confusión - Árbol Podado.')
plt.show()

# Build a plotting frame: the single LDA component of the training data
# alongside the class label of each row.
tmp_Df = pd.DataFrame(X_train_lda, columns=['LDA Component 1']).assign(
    Class=y_train_smote.values
)

# Histogram of the LDA component with a KDE overlay, coloured by class.
sns.histplot(data=tmp_Df, x='LDA Component 1', hue='Class', kde=True)
plt.show()

# Grouped bar chart comparing the initial and the pruned tree on test accuracy
# and weighted F1.
#
# The bars read the metric variables computed during evaluation (acc0/f10 for
# the initial tree, acc_p/f1_p for the pruned one) rather than hand-copied
# rounded numbers, so the chart always matches the current run.
modelos = ['Árbol Inicial.', 'Árbol Podado.']
accuracies = [acc0, acc_p]
f1s = [f10, f1_p]

x = np.arange(len(modelos))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(x - width / 2, accuracies, width, label='Accuracy', color='steelblue')
ax.bar(x + width / 2, f1s, width, label='F1 weighted', color='coral')

# Keep the zoomed 0.75-0.85 window when the scores fit inside it, and widen it
# (within [0, 1]) when they do not, so no bar is clipped or hidden.
all_scores = accuracies + f1s
y_low = max(0.0, min(0.75, min(all_scores) - 0.02))
y_high = min(1.0, max(0.85, max(all_scores) + 0.02))
ax.set_ylim(y_low, y_high)

ax.set_xticks(x)
ax.set_xticklabels(modelos)
ax.legend()
ax.set_title('Comparación de Árbol Inicial vs Podado.')
plt.tight_layout()
plt.show()

	fecha	id_consulta	edad	sexo	peso	altura	municipio_unidad_medica	institucion_unidad_medica	clave_grupo_ enfermedad	descripcion_grupo_enfermedad	clave_enfermedad	descripcion_enfermedad
0	02/10/2024	SM_2024_38869	21	Masculino	82	174	LINARES	HOSPITAL GENERAL DE LINARES	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION
1	08/10/2024	SM_2024_38870	21	Masculino	82	174	LINARES	HOSPITAL GENERAL DE LINARES	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION
2	08/10/2024	SM_2024_38871	5	Masculino	21	111	LINARES	HOSPITAL GENERAL DE LINARES	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F919	TRASTORNO DE LA CONDUCTA NO ESPECIFICADO
3	09/10/2024	SM_2024_38872	69	Masculino	sin valor	sin valor	LINARES	HOSPITAL GENERAL DE LINARES	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F321	EPISODIO DEPRESIVO MODERADO
4	09/10/2024	SM_2024_38873	78	Masculino	sin valor	sin valor	LINARES	HOSPITAL GENERAL DE LINARES	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F321	EPISODIO DEPRESIVO MODERADO

	fecha	id_consulta	edad	sexo	peso	altura	municipio_unidad_medica	institucion_unidad_medica	clave_grupo_ enfermedad	descripcion_grupo_enfermedad	clave_enfermedad	descripcion_enfermedad
2625	02/10/2024	SM_2024_41494	33	Masculino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F122	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO DEBID...
2626	02/10/2024	SM_2024_41495	53	Femenino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION
2627	02/10/2024	SM_2024_41496	60	Femenino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F630	JUEGO PATOLOGICO
2628	04/10/2024	SM_2024_41497	46	Femenino	70	165	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION
2629	04/10/2024	SM_2024_41498	47	Femenino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION

	fecha	id_consulta	edad	sexo	peso	altura	municipio_unidad_medica	institucion_unidad_medica	clave_grupo_ enfermedad	descripcion_grupo_enfermedad	clave_enfermedad	descripcion_enfermedad
2625	02/10/2024	SM_2024_41494	33	Masculino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F122	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO DEBID...
2626	02/10/2024	SM_2024_41495	53	Femenino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION
2627	02/10/2024	SM_2024_41496	60	Femenino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F630	JUEGO PATOLOGICO
2628	04/10/2024	SM_2024_41497	46	Femenino	70	165	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION
2629	04/10/2024	SM_2024_41498	47	Femenino	60	160	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION

	fecha	id_consulta	edad	sexo	peso	altura	municipio_unidad_medica	institucion_unidad_medica	clave_grupo_ enfermedad	descripcion_grupo_enfermedad	clave_enfermedad	descripcion_enfermedad	sexo_num	institucion_unidad_medica_num	descripcion_grupo_enfermedad_num	clave_enfermedad_num	descripcion_enfermedad_num
23330	31/03/2025	SM_2025_10466	42	Femenino	54	158	SAN PEDRO GARZA GARCIA	CENTRO DE SALUD CON SERVICIOS AMPLIADOS SAN PE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F411	TRASTORNO DE ANSIEDAD GENERALIZADA	0	1	1	31	29
32281	16/05/2025	SM_2025_19417	54	Femenino	66.6	152	SAN PEDRO GARZA GARCIA	CENTRO DE SALUD CON SERVICIOS AMPLIADOS SAN PE...	XXI	FACTORES QUE INFLUYEN EN EL ESTADO DE SALUD Y ...	Z630	PROBLEMAS EN LA RELACION ENTRE ESPOSOS O PAREJA	0	1	0	46	16
11234	05/12/2024	SM_2024_50103	62	Femenino	61.1	149	SAN PEDRO GARZA GARCIA	CENTRO DE SALUD CON SERVICIOS AMPLIADOS SAN PE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION	0	1	1	32	38
36867	13/06/2025	SM_2025_24003	25	Masculino	80	170	SAN PEDRO GARZA GARCIA	CENTRO COMUNITARIO DE SALUD MENTAL Y ADICCIONE...	V	TRASTORNOS MENTALES Y DEL COMPORTAMIENTO	F412	TRASTORNO MIXTO DE ANSIEDAD Y DEPRESION	1	0	1	32	38
32315	27/05/2025	SM_2025_19451	22	Femenino	65	160	SAN PEDRO GARZA GARCIA	CENTRO DE SALUD CON SERVICIOS AMPLIADOS SAN PE...	XXI	FACTORES QUE INFLUYEN EN EL ESTADO DE SALUD Y ...	Z634	PROBLEMAS RELACIONADOS CON LA DESAPARICION O M...	0	1	0	50	22

Estatus de la Salud Mental en adultos del municipio más acaudalado de América Latina.¶

1.1 Introducción.¶

1.2 Objetivo.¶

2.1 Descripción del conjunto de datos.¶

2.2 Preparación y limpieza del conjunto de datos.¶

3.1 Partición de los datos.¶

4.1 LDA.¶

5.1 Árbol de Decisión.¶

5.2 Árbol de Decisión Podado.¶

6.1 Resultados.¶

7.1 Discusiones.¶

8.1 Conclusiones.¶

8.2 Aprendizajes.¶

8.3 Implicaciones.¶

8.4 Posibles líneas futuras.¶

Referencias.¶

9.1 Código de Honor de la Universidad de Monterrey.¶