
Mar 10, 2024

Boosting tree - XGBoost + GridSearchCV


# Install xgboost first if it is not already available
!pip install xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation
y = df['y']
X = df.drop('y', axis = 1)

# Split train : test data set = 7:3 and set random seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

# Define a model structure
xgbr = XGBRegressor(random_state = 42)

# Set the range of parameters
params = {
    'max_depth' : [3, 5, 7, 9],
    'n_estimators' : [50, 70, 90, 100],
    'learning_rate' : [0.03, 0.01],
    'subsample' : [0.7, 0.8, 0.9],
    'colsample_bytree' : [0.7, 0.8, 0.9]
    # 'reg_alpha' : [0, 1],
    # 'reg_lambda' : [0, 1]
}

# Set the condition of grid search model
grid = GridSearchCV(estimator = xgbr,
                    param_grid = params,
                    scoring = 'r2',
                    n_jobs = -1, # use all available cores
                    verbose = 2,
                    cv = 5)
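
# Note on cost: 4 x 4 x 2 x 3 x 3 = 288 parameter combinations x 5 CV folds
# = 1,440 model fits; enabling the commented-out reg_alpha / reg_lambda lists
# would quadruple that.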

# Train the grid search model
grid.fit(X_train, y_train)

# Confirm the optimal parameters
grid.best_params_

# Model prediction using those optimal parameters
best_pred = grid.predict(X_test)

# Error metric
print("r2 score:", r2_score(y_test, best_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, best_pred)))

Mar 2, 2024

Decision tree regressor


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import r2_score, mean_squared_error

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation
y = df['y']
X = df.drop('y', axis = 1)

# Split train : test data set = 7:3 and set random seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

# Define a model structure
dtr = DecisionTreeRegressor(max_depth = 16, random_state = 42)

# Train model
dtr.fit(X_train, y_train)

# Model prediction
dtr_pred = dtr.predict(X_test)

# Error metric
print("Ridge-r2 score:", r2_score(y_test, dtr_pred))
print("Ridge-RMSE:", np.sqrt(mean_squared_error(y_test, dtr_pred)))

# Plot
plt.bar(X_train.columns, dtr.feature_importances_)
plt.figure(figsize=(8, 8))
plot_tree(dtr);
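
A max_depth of 16 is quite deep and can overfit; a minimal sketch for choosing the depth by cross-validation instead (the candidate depths are illustrative):

from sklearn.model_selection import cross_val_score

for depth in [4, 8, 12, 16]:
    scores = cross_val_score(DecisionTreeRegressor(max_depth = depth, random_state = 42),
                             X_train, y_train, scoring = 'r2', cv = 5)
    print("depth:", depth, "mean CV r2:", scores.mean())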

Mar 1, 2024

Decision tree classifier


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import plot_tree

# Data load
from sklearn.datasets import load_iris
iris = load_iris()

# Data separation
X = pd.DataFrame(iris['data'], columns = iris['feature_names'])
y = iris['target']

# Split train : test data set = 7:3 and set random seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42,
                                                    stratify = y)

# Define a model structure
dtc = DecisionTreeClassifier(max_depth = 3, random_state = 42)

# Train model
dtc.fit(X_train, y_train)

# Model prediction
dtc_pred = dtc.predict(X_test)

# Error metric
print(confusion_matrix(y_test, dtc_pred))
print(classification_report(y_test, dtc_pred))

# Confirm feature importances
dtc.feature_importances_

# Plot
plt.bar(X_train.columns, dtc.feature_importances_)
plt.figure(figsize=(8, 8))
plot_tree(dtc,
          feature_names = iris['feature_names'],
          class_names = iris['target_names'],
          filled = True
);
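
The individual metric functions imported above (accuracy_score, precision_score, recall_score, f1_score) go unused in the snippet; for the three-class iris target the last three need an averaging mode. A minimal sketch with macro averaging (one option among several):

print("accuracy :", accuracy_score(y_test, dtc_pred))
print("precision:", precision_score(y_test, dtc_pred, average = 'macro'))
print("recall   :", recall_score(y_test, dtc_pred, average = 'macro'))
print("f1       :", f1_score(y_test, dtc_pred, average = 'macro'))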

Feb 27, 2024

Polynomial regression + Lasso regression

Ref.: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation
y = df['y']
X = df.drop('y', axis = 1)

# Split train : test data set = 7:3 and set random seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)


# Define a model structure for a polynomial variable
poly = PolynomialFeatures(degree = 2, include_bias = False)

# Train polynomial variable
poly.fit(X_train)

# Show feature names
poly.get_feature_names_out()
# poly.get_feature_names() # older scikit-learn versions

# Widen the pandas display so all generated columns are visible
pd.options.display.max_columns = 200
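
# With degree = 2 and include_bias = False, two inputs a, b expand to
# a, b, a^2, a*b, b^2; in general d inputs give d + d*(d+1)/2 features.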

# Change the data frame to the polynomial variables
poly_X_train = pd.DataFrame(poly.transform(X_train),
                            columns = poly.get_feature_names_out())

# poly_X_train = pd.DataFrame(poly.transform(X_train),
#                             columns = poly.get_feature_names())

# Change the test data to the polynomial variables
poly_X_test = poly.transform(X_test)

# Define a model structure
ls = Lasso() # the l1 penalty shrinks weak coefficients toward exactly zero

# Train model
ls.fit(poly_X_train, y_train)

# Model prediction
ls_pred = ls.predict(poly_X_test)

# Confirm coefficient
ls.coef_

# Error metric
print("Lasso-r2 score:", r2_score(y_test, ls_pred))
print("Lasso-RMSE:", np.sqrt(mean_squared_error(y_test, ls_pred)))

# Plot
plt.figure(figsize=(20, 20))
plt.barh(poly.get_feature_names_out(), ls.coef_)
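
Lasso() above runs with its default alpha = 1.0; a minimal sketch for picking alpha by cross-validation with LassoCV (the alpha grid is illustrative):

from sklearn.linear_model import LassoCV

ls_cv = LassoCV(alphas = [0.001, 0.01, 0.1, 1.0], cv = 5)
ls_cv.fit(poly_X_train, y_train)
print("selected alpha:", ls_cv.alpha_)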


Feb 26, 2024

Ridge regression

Linear least squares with l2 regularization.
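
In scikit-learn's formulation this means minimizing

    \min_w \; \lVert X w - y \rVert_2^2 + \alpha \lVert w \rVert_2^2

where alpha >= 0 sets the strength of the l2 penalty (the alpha argument passed to Ridge below).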

Ref.: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation
y = df['y']
X = df.drop('y', axis = 1)

# Split train : test data set = 7:3 and set random seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)


# Define model structure
rg = Ridge(alpha = 0.1)

# Train model
rg.fit(X_train, y_train)

# Model prediction
rg_pred = rg.predict(X_test)

# Confirm coefficient
rg.coef_

# Error metric
print("Ridge-r2 score:", r2_score(y_test, rg_pred))
print("Ridge-RMSE:", np.sqrt(mean_squared_error(y_test, rg_pred)))

# Plot
plt.bar(X_train.columns, rg.coef_)
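
alpha = 0.1 above is a fixed choice; RidgeCV can select it by cross-validation instead (the candidate list is illustrative):

from sklearn.linear_model import RidgeCV

rg_cv = RidgeCV(alphas = [0.01, 0.1, 1.0, 10.0], cv = 5)
rg_cv.fit(X_train, y_train)
print("selected alpha:", rg_cv.alpha_)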


Feb 24, 2024

Linear regression (basic linear regression model)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error


# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation
y = df['y']
X = df.drop('y', axis = 1)

# Split train : test data set = 7:3 and set random seed
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

# Define model structure
lr = LinearRegression()

# Train model
lr.fit(X_train, y_train)

# Model prediction
lr_pred = lr.predict(X_test)

# Confirm coefficient
lr.coef_

# Error metric
print("r2 score:", r2_score(y_test, lr_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, lr_pred)))

# Plot
plt.scatter(lr_pred, y_test)
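
A common addition to this prediction-vs-actual scatter is a y = x reference line, so deviations from perfect prediction stand out (a minimal sketch):

lims = [min(lr_pred.min(), y_test.min()), max(lr_pred.max(), y_test.max())]
plt.plot(lims, lims, 'r--') # perfect-prediction line
plt.xlabel('predicted')
plt.ylabel('actual')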