Jun 24, 2025

[Python] module vs package

* Module(모듈): .py 파일 하나 (책 한권)

  - ex: os.py, random.py (참고: math는 C로 구현된 내장 모듈이라 math.py 파일은 없음)

* Package(패키지): __init__.py가 있는 폴더

  - 여러 모듈을 포함한 폴더 (책 여러 권이 꽂혀 있는 책꽂이)

  - 일반(regular) 패키지는 __init__.py 파일이 있어야 함 (Python 3.3+에서는 __init__.py가 없는 폴더도 namespace package로 import 가능)

  - numpy, sklearn, ...



Mar 10, 2024

Boosting tree - XGBoost + GridSearchCV


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# XGBoost regression tuned with an exhaustive, cross-validated grid search.
#
# Install note: xgboost has to be installed BEFORE the `from xgboost import ...`
# line in the import section runs. In a notebook, run `%pip install xgboost`
# in an earlier cell; from a shell, run `pip install xgboost`.
# (`!pip install ...` is IPython-only syntax and is a SyntaxError in a plain
# Python script, so it is intentionally not executed here.)

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation: column 'y' is the target, every other column is a feature
y = df['y']
X = df.drop(columns='y')

# Split train : test data set = 7:3 and fix the random seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Define the base model; its hyper-parameters are overridden by the grid search
xgbr = XGBRegressor(random_state=42)

# Set the range of parameters.
# 4 * 4 * 2 * 3 * 3 = 288 candidate combinations, each evaluated with 5-fold CV.
params = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 70, 90, 100],
    'learning_rate': [0.03, 0.01],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    # Optional regularization terms that can be added to the search:
    # 'reg_alpha': [0, 1],
    # 'reg_lambda': [0, 1],
}

# Set the condition of the grid search model
grid = GridSearchCV(
    estimator=xgbr,
    param_grid=params,
    scoring='r2',
    n_jobs=-1,  # use all available CPU cores
    verbose=2,
    cv=5,
)

# Train the grid search model
grid.fit(X_train, y_train)

# Confirm the optimal parameters (printed explicitly so the result is visible
# even when this is not the last expression of a notebook cell)
print("Best parameters:", grid.best_params_)

# Model prediction using those optimal parameters.
# With the default refit=True, GridSearchCV.predict delegates to the best
# estimator refitted on the whole training set.
best_pred = grid.predict(X_test)

# Error metrics on the hold-out test set
print("r2 score:", r2_score(y_test, best_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, best_pred)))

Mar 2, 2024

Decision tree regressor


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.metrics import r2_score, mean_squared_error

# Decision tree regression: fit on a 70% train split, evaluate on the 30%
# hold-out split, then visualize feature importances and the fitted tree.

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation: column 'y' is the target, every other column is a feature
y = df['y']
X = df.drop(columns='y')

# Split train : test data set = 7:3 and fix the random seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Define a model structure
dtr = DecisionTreeRegressor(max_depth=16, random_state=42)

# Train model
dtr.fit(X_train, y_train)

# Model prediction
dtr_pred = dtr.predict(X_test)

# Error metrics on the hold-out test set.
# Labels name the model actually used here (the previous "Ridge-" prefix was
# a copy-paste leftover from another model's snippet).
print("DecisionTree-r2 score:", r2_score(y_test, dtr_pred))
print("DecisionTree-RMSE:", np.sqrt(mean_squared_error(y_test, dtr_pred)))

# Plot feature importances in their own figure
plt.figure()
plt.bar(X_train.columns, dtr.feature_importances_)

# Plot the fitted tree in a separate figure; without a new figure plot_tree
# would draw onto the current axes, i.e. on top of the bar chart.
# NOTE(review): with max_depth=16 the full tree can be very large and hard to
# read; consider passing plot_tree(..., max_depth=<small int>) to truncate it.
plt.figure()
plot_tree(dtr, feature_names=list(X_train.columns))

plt.show()