# Ref.: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# Polynomial-feature Lasso regression: load a CSV, expand the predictors to
# degree-2 polynomial terms, fit a Lasso model, report test-set metrics and
# plot the fitted coefficients.

# Data load
df = pd.read_csv('./data/xxx.csv')

# Target data separation: column 'y' is the target, everything else is a predictor
y = df['y']
X = df.drop('y', axis=1)

# Split train : test data set = 7:3 and set random seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Define the polynomial expansion (degree 2, no constant column because the
# linear model fits its own intercept)
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training predictors only
poly.fit(X_train)

# Show feature names. Computed once and reused below.
# NOTE: get_feature_names_out() needs scikit-learn >= 1.0; older releases
# expose get_feature_names() instead.
feature_names = poly.get_feature_names_out()
print(feature_names)

pd.options.display.max_columns = 200

# Change the train data to the polynomial variables
poly_X_train = pd.DataFrame(poly.transform(X_train), columns=feature_names)

# Change the test data to the polynomial variables. Kept as a DataFrame with
# the same columns as the training matrix so that predict() receives the
# feature names the model was fitted with (otherwise scikit-learn warns
# "X does not have valid feature names").
poly_X_test = pd.DataFrame(poly.transform(X_test), columns=feature_names)

# Define a model structure
ls = Lasso()  # L1 penalty shrinks low-importance coefficients towards zero

# Train model
ls.fit(poly_X_train, y_train)

# Model prediction
ls_pred = ls.predict(poly_X_test)

# Confirm coefficient (printed so it is visible when run as a script)
print(ls.coef_)

# Error metric
print("Lasso-r2 score:", r2_score(y_test, ls_pred))
print("Lasso-RMSE:", np.sqrt(mean_squared_error(y_test, ls_pred)))

# Plot one horizontal bar per polynomial feature
plt.figure(figsize=(20, 20))
plt.barh(feature_names, ls.coef_)
# Needed for the figure to appear outside an inline notebook backend
plt.show()