
ML_04 (regression s4-11~18: website visitor prediction)

by 쿠룽지 2023. 11. 20.

 

Code that was only there for practice/exploration has been left out.

 

#while practicing, columns with high corr values were dropped from the features, but in fact it is fine to use every column except the id
#import
import pandas as pd
import numpy as np
 # display options (optional); either form works:
 # pd.options.display.max_rows = 500
 # pd.set_option('display.max_columns', 20)
pd.set_option('display.float_format', '{:.4f}'.format)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler   # imported but not used in this run

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae   # imported but not used below
from sklearn.metrics import mean_squared_error as mse    # imported but not used below
from sklearn.metrics import mean_squared_log_error as msle

#functions
#1 performance evaluation
def get_scores_f(model, xtrain, xtest, ytrain, ytest):
    pred1 = model.predict(xtrain)   # train
    pred2 = model.predict(xtest)    # test

    # r2 is fine even if the predictions contain negative values
    A1 = r2_score(ytrain, pred1) # train
    A2 = r2_score(ytest, pred2)  # test

    # msle / rmsle take a log, so negative predictions are not allowed (clip them to 0 first)
    pred2 = np.where(pred2<0, 0, pred2)
    B = msle(ytest, pred2)
    C = np.sqrt(B)   # rmsle
    data = [round(x, 4) for x in [A1, A2, C]]
    names = 'r2_train r2_test rmsle'.split()
    scores = pd.Series(data, index=names)
    return scores
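
# A quick sanity check of the rmsle above (toy values assumed for illustration, not from this dataset):
# mean_squared_log_error is the mean of (log1p(y_true) - log1p(y_pred))**2, so np.sqrt(msle(...))
# matches a hand-rolled version.
_y_true_toy = np.array([100.0, 200.0, 300.0])
_y_pred_toy = np.array([90.0, 210.0, 330.0])
_rmsle_by_hand = np.sqrt(np.mean((np.log1p(_y_true_toy) - np.log1p(_y_pred_toy)) ** 2))
# print(round(_rmsle_by_hand, 4), round(np.sqrt(msle(_y_true_toy, _y_pred_toy)), 4))  # identical values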

#2 build the models
def make_models_f(xtrain, xtest, ytrain, ytest, n=300):
    temp = pd.DataFrame()

    #LinearRegression
    model1 = LinearRegression().fit(xtrain, ytrain)
    temp['model1'] = get_scores_f(model1, xtrain, xtest, ytrain, ytest)

    #DecisionTreeRegressor
    model2 = DecisionTreeRegressor(random_state=0).fit(xtrain, ytrain)
    temp['model2'] = get_scores_f(model2, xtrain, xtest, ytrain, ytest)

    for d in range(3, 9):
        model2 = DecisionTreeRegressor(max_depth=d, random_state=0).fit(xtrain, ytrain)
        temp[f'model2_{d}'] = get_scores_f(model2, xtrain, xtest, ytrain, ytest)

    #RandomForestRegressor
    model3 = RandomForestRegressor(n_estimators=n, random_state=0).fit(xtrain, ytrain)
    temp['model3'] = get_scores_f(model3, xtrain, xtest, ytrain, ytest)

    for d in range(3, 9):
        model3 = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=0).fit(xtrain, ytrain)
        temp[f'model3_{d}'] = get_scores_f(model3, xtrain, xtest, ytrain, ytest)

    #XGBRegressor
    model4 = XGBRegressor(objective='reg:squarederror').fit(xtrain, ytrain)
    temp['model4'] = get_scores_f(model4, xtrain, xtest, ytrain, ytest)
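
    # (optional sketch, not part of the original run) the same max_depth sweep used for the tree models
    # above could be applied to XGBRegressor as well; uncomment to add those rows to the comparison table.
    # for d in range(3, 9):
    #     model4 = XGBRegressor(objective='reg:squarederror', max_depth=d).fit(xtrain, ytrain)
    #     temp[f'model4_{d}'] = get_scores_f(model4, xtrain, xtest, ytrain, ytest)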

    temp = temp.T
    temp.insert(2, 'diff', (temp['r2_train'] - temp['r2_test']).abs())

    return temp

#read the data
X_use = pd.read_csv('x_train.csv')
X_submission = pd.read_csv('x_test.csv')
Y = pd.read_csv('y_train.csv')

#build dfX (preprocess the training and submission data together)
dfX = pd.concat([X_use, X_submission], ignore_index=True, axis=0)
#dfX.info()   no missing values, but day, date, page_loads, first_time_visits, returning_visits -> object

#dfX after preprocessing
names = ['page_loads', 'first_time_visits', 'returning_visits']
dfX[names] = dfX[names].replace(',', '', regex=True).astype(int) # remove the ',' from the values and convert to int
dfX['date'] = pd.to_datetime(dfX['date'], format='%m/%d/%Y')
dfX2 = dfX.drop(columns='day') #dropped because it carries the same information as day_of_week
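
# quick illustration of the comma handling above on a throwaway Series (toy value assumed):
_comma_demo = pd.Series(['2,146']).replace(',', '', regex=True).astype(int)
# print(_comma_demo[0])  # 2146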

date = pd.DataFrame()
temp = dfX2['date'].dt # datetime (.dt) accessor for dfX2's date column
date['year'] = temp.year # put temp.year into date['year']
date['month'] = temp.month
date['day'] = temp.day
dfX3 = dfX2.drop(columns='date') # dfX3 = dfX2 with the original date column removed
dfX3 = pd.concat([dfX3, date], axis=1)
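# (side note) the .dt accessor can also derive a numeric weekday directly (Monday=0 ... Sunday=6),
# which could serve as an alternative weekday feature; the provided day_of_week column is used below instead.
_weekday_alt = temp.dayofweek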
#print(dfX3.info()) # row, day_of_week, page_loads, first_time_visits, returning_visits, year, month, day
#dfXY = pd.merge(dfX3, Y) # optional step, only needed to check correlations with the target
#print(dfXY.corr()['unique_visits'])

# print(dfX3.columns) > build dfX4 without the row column
Xfeatures = ['day_of_week', 'page_loads', 'first_time_visits',
             'returning_visits', 'year', 'month', 'day']
dfX4 = dfX3[Xfeatures]

train_size = len(X_use)
XF = dfX4[:train_size]
X_submissionF = dfX4[train_size:]
YF = Y['unique_visits']

# print([x.shape for x in [XF, X_submissionF, YF]]) >> [(1301, 7), (866, 7), (1301,)]

xtrain, xtest, ytrain, ytest = train_test_split(XF, YF, test_size = 0.3, random_state=1234)
# print([x.shape for x in [xtrain, xtest, ytrain, ytest]]) >> [(910, 7), (391, 7), (910,), (391,)]

models = make_models_f(xtrain, xtest, ytrain, ytest)
#print(models)
#print(models.sort_values('rmsle').head(10))
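# (optional sketch) one way to screen out overfit models before choosing: keep only rows whose
# train/test r2 gap ('diff', added in make_models_f) is small, then sort by rmsle;
# the 0.05 threshold is just an illustrative value.
#print(models[models['diff'] < 0.05].sort_values('rmsle'))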

#choose the final model
model = XGBRegressor(objective='reg:squarederror').fit(xtrain, ytrain)
print(get_scores_f(model, xtrain, xtest, ytrain, ytest))
pred = model.predict(X_submissionF)

#save only the required columns
submission = pd.DataFrame({'row': X_submission['row'], 'unique_visits':pred})
print(submission.head())

submission.to_csv('000001000.csv', index=False)
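
# optional check (not required): re-read the saved file to confirm its shape and columns
# chk = pd.read_csv('000001000.csv')
# print(chk.shape, chk.columns.tolist())   # should be (866, 2) ['row', 'unique_visits'] given the shapes above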

 
