Code used only for practice/study has been left out.
# When training, I dropped all the columns with high corr values from the features, but in fact you could use everything except the id.
# imports
import pandas as pd
import numpy as np
# Not sure which display option is right, so for now: pd.options.display.max_rows = 500
# pd.set_option('display.max_columns', 20)
pd.set_option('display.float_format', '{:.4f}'.format)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle
# Functions
# 1. Scoring helper
def get_scores_f(model, xtrain, xtest, ytrain, ytest):
    pred1 = model.predict(xtrain)  # train predictions
    pred2 = model.predict(xtest)   # test predictions
    # r2 is fine even when some predictions are negative
    A1 = r2_score(ytrain, pred1)   # train
    A2 = r2_score(ytest, pred2)    # test
    # msle / rmsle take a log, so negative predictions are not allowed
    pred2 = np.where(pred2 < 0, 0, pred2)
    B = msle(ytest, pred2)
    C = np.sqrt(B)  # rmsle
    data = [round(x, 4) for x in [A1, A2, C]]
    names = 'r2_train r2_test rmsle'.split()
    scores = pd.Series(data, index=names)
    return scores
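The clipping step matters because mean_squared_log_error raises an error on negative inputs. A minimal sketch with made-up numbers (not from the dataset) showing the effect:
# Minimal sketch (synthetic numbers, not the real data): negative predictions
# must be clipped to 0 before msle/rmsle, otherwise sklearn raises a ValueError.
y_true_demo = np.array([3.0, 5.0, 2.5])
y_pred_demo = np.array([2.5, -0.1, 3.0])             # contains a negative prediction
y_pred_demo = np.where(y_pred_demo < 0, 0, y_pred_demo)
print(np.sqrt(msle(y_true_demo, y_pred_demo)))       # RMSLE on the clipped predictions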
# 2. Model building
def make_models_f(xtrain, xtest, ytrain, ytest, n=300):
    temp = pd.DataFrame()
    # LinearRegression
    model1 = LinearRegression().fit(xtrain, ytrain)
    temp['model1'] = get_scores_f(model1, xtrain, xtest, ytrain, ytest)
    # DecisionTreeRegressor
    model2 = DecisionTreeRegressor(random_state=0).fit(xtrain, ytrain)
    temp['model2'] = get_scores_f(model2, xtrain, xtest, ytrain, ytest)
    for d in range(3, 9):
        model2 = DecisionTreeRegressor(max_depth=d, random_state=0).fit(xtrain, ytrain)
        temp[f'model2_{d}'] = get_scores_f(model2, xtrain, xtest, ytrain, ytest)
    # RandomForestRegressor
    model3 = RandomForestRegressor(n_estimators=n, random_state=0).fit(xtrain, ytrain)
    temp['model3'] = get_scores_f(model3, xtrain, xtest, ytrain, ytest)
    for d in range(3, 9):
        model3 = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=0).fit(xtrain, ytrain)
        temp[f'model3_{d}'] = get_scores_f(model3, xtrain, xtest, ytrain, ytest)
    # XGBRegressor
    model4 = XGBRegressor(objective='reg:squarederror').fit(xtrain, ytrain)
    temp['model4'] = get_scores_f(model4, xtrain, xtest, ytrain, ytest)
    temp = temp.T
    temp.insert(2, 'diff', (temp['r2_train'] - temp['r2_test']).abs())
    return temp
# Load data
X_use = pd.read_csv('x_train.csv')
X_submission = pd.read_csv('x_test.csv')
Y = pd.read_csv('y_train.csv')
# Build dfX (preprocess the training and submission data together)
dfX = pd.concat([X_use, X_submission], ignore_index=True, axis=0)
# dfX.info(): no missing values, but day, date, page_loads, first_time_visits, returning_visits are object dtype
# Preprocess dfX
names = ['page_loads', 'first_time_visits', 'returning_visits']
dfX[names] = dfX[names].replace(',', '', regex=True).astype(int)  # strip commas from the values and cast to int
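A minimal sketch (synthetic values, not the real file) of what the comma-stripping step above does:
# Synthetic demo of the comma-stripping conversion (not the real data):
demo = pd.DataFrame({'page_loads': ['2,146', '3,621']})
demo['page_loads'] = demo['page_loads'].replace(',', '', regex=True).astype(int)
print(demo['page_loads'].tolist())  # [2146, 3621]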
dfX['date'] = pd.to_datetime(dfX['date'], format='%m/%d/%Y')
dfX2 = dfX.drop(columns='day')  # dropped: its values carry the same information (redundant)
date = pd.DataFrame()
temp = dfX2['date'].dt  # .dt accessor for dfX2['date']
date['year'] = temp.year  # year component into date['year']
date['month'] = temp.month
date['day'] = temp.day
dfX3 = dfX2.drop(columns='date')  # dfX3 is dfX2 without the original date column
dfX3 = pd.concat([dfX3, date], axis=1)
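A quick sketch (made-up dates, not the real column) of the .dt accessor used above:
# Synthetic demo of the .dt accessor (not the real column):
demo_dates = pd.to_datetime(pd.Series(['9/14/2014', '12/1/2014']), format='%m/%d/%Y')
print(demo_dates.dt.year.tolist())   # [2014, 2014]
print(demo_dates.dt.month.tolist())  # [9, 12]
print(demo_dates.dt.day.tolist())    # [14, 1]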
#print(dfX3.info()) # row, day_of_week, page_loads, first_time_visits, returning_visits, year, month, day
#dfXY = pd.merge(dfX3, Y)  # optional step, fine to skip
#print(dfXY.corr()['unique_visits'])
# print(dfX3.columns) -> build dfX4 with everything except 'row'
Xfeatures = ['day_of_week', 'page_loads', 'first_time_visits',
'returning_visits', 'year', 'month', 'day']
dfX4 = dfX3[Xfeatures]
train_size = len(X_use)
XF = dfX4[:train_size]
X_submissionF = dfX4[train_size:]
YF = Y['unique_visits']
# print([x.shape for x in [XF, X_submissionF, YF]]) >> [(1301, 7), (866, 7), (1301,)]
xtrain, xtest, ytrain, ytest = train_test_split(XF, YF, test_size = 0.3, random_state=1234)
# print([x.shape for x in [xtrain, xtest, ytrain, ytest]]) >> [(910, 7), (391, 7), (910,), (391,)]
models = make_models_f(xtrain, xtest, ytrain, ytest)
#print(models)
#print(models.sort_values('rmsle').head(10))
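make_models_f only fits XGBRegressor with default settings; a hedged sketch (hypothetical extension, not part of the original run) of sweeping its max_depth the same way as the tree and forest loops:
# Hypothetical extension (not in the original post): sweep XGBRegressor max_depth
# like the DecisionTree/RandomForest loops and score with the same helper.
xgb_scores = pd.DataFrame()
for d in range(3, 9):
    m = XGBRegressor(objective='reg:squarederror', max_depth=d, random_state=0).fit(xtrain, ytrain)
    xgb_scores[f'xgb_{d}'] = get_scores_f(m, xtrain, xtest, ytrain, ytest)
# print(xgb_scores.T.sort_values('rmsle'))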
# Final model choice
model = XGBRegressor(objective='reg:squarederror').fit(xtrain, ytrain)
print(get_scores_f(model, xtrain, xtest, ytrain, ytest))
pred = model.predict(X_submissionF)
# Save only the required columns
submission = pd.DataFrame({'row': X_submission['row'], 'unique_visits':pred})
print(submission.head())
submission.to_csv('000001000.csv', index=False)
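An optional sanity check (not in the original post) that the submission file was written as expected:
# Optional sanity check (not in the original post): re-read the saved file and
# confirm it has one row per submission record and the two expected columns.
check = pd.read_csv('000001000.csv')
print(check.shape, list(check.columns))  # expected: (866, 2) ['row', 'unique_visits']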