728x90
반응형
학습을 위한 코드구문 제외
#import
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
#함수
# 1. Performance evaluation
def get_scores_f(model, xtrain, xtest, ytrain, ytest):
    """Score an already-fitted regressor on the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        xtrain: Features passed to ``model.predict`` for the train score.
        xtest: Features passed to ``model.predict`` for the test scores.
        ytrain: True targets matching ``xtrain``.
        ytest: True targets matching ``xtest``.

    Returns:
        A ``pandas.Series`` indexed by ``r2_train``, ``r2_test`` and
        ``rmsle``, each value rounded to 4 decimal places.
    """
    pred_train = model.predict(xtrain)
    pred_test = model.predict(xtest)

    # R^2 is fine with negative predictions, so score them unmodified.
    r2_train = r2_score(ytrain, pred_train)
    r2_test = r2_score(ytest, pred_test)

    # The log-based error must not see negative values, so negative
    # predictions are clipped to 0 before computing it. The clipped copy is
    # kept separate so the R^2 above still reflects the raw predictions.
    pred_test_nonneg = np.clip(pred_test, 0, None)
    rmsle = np.sqrt(msle(ytest, pred_test_nonneg))

    return pd.Series(
        [round(value, 4) for value in (r2_train, r2_test, rmsle)],
        index=['r2_train', 'r2_test', 'rmsle'],
    )
# 2. Model building
def _candidate_models_f(n):
    """Yield (row label, unfitted estimator) pairs in report order."""
    depths = range(3, 9)

    yield 'model1', LinearRegression()

    yield 'model2', DecisionTreeRegressor(random_state=0)
    for d in depths:
        yield f'model2_{d}', DecisionTreeRegressor(max_depth=d, random_state=0)

    yield 'model3', RandomForestRegressor(n_estimators=n, random_state=0)
    for d in depths:
        yield f'model3_{d}', RandomForestRegressor(
            n_estimators=n, max_depth=d, random_state=0)

    yield 'model4', XGBRegressor(objective='reg:squarederror')


def make_models_f(xtrain, xtest, ytrain, ytest, n=300):
    """Fit several candidate regressors and tabulate their scores.

    The candidates are a linear regression, decision trees (unbounded and
    ``max_depth`` 3..8), random forests (unbounded and ``max_depth`` 3..8)
    and an XGBoost regressor. Each is fitted on the train split and scored
    with ``get_scores_f``.

    Args:
        xtrain: Training features.
        xtest: Hold-out features.
        ytrain: Training targets.
        ytest: Hold-out targets.
        n: Number of trees used by every random forest candidate.

    Returns:
        A ``pandas.DataFrame`` with one row per candidate and the columns
        ``r2_train``, ``r2_test``, ``diff`` and ``rmsle``, where ``diff`` is
        the absolute gap between the train and test R^2.
    """
    scores = {}
    for label, estimator in _candidate_models_f(n):
        fitted = estimator.fit(xtrain, ytrain)
        scores[label] = get_scores_f(fitted, xtrain, xtest, ytrain, ytest)

    # One column per model so far; transpose to get one row per model.
    table = pd.DataFrame(scores).T
    # Place the train/test gap right after the two R^2 columns.
    table.insert(2, 'diff', (table['r2_train'] - table['r2_test']).abs())
    return table
# Load the data
X_use = pd.read_csv('x_train.csv')
X_submission = pd.read_csv('x_test.csv')
Y = pd.read_csv('y_train.csv')
# Shapes noted by the author: [(803, 7), (535, 7), (803, 2)]
# print([x.shape for x in [X_use, X_submission, Y]])

# Stack the train and submission features so that the categorical columns
# below receive the same integer codes in both parts.
dfX = pd.concat([X_use, X_submission], ignore_index=True, axis=0)

# Author's note from dfX.info(): no missing values; sex, smoker and region
# are object dtype.
for x in ['sex', 'smoker', 'region']:
    # Integer codes 0..k-1 in order of first appearance.
    dfX[x] = pd.factorize(dfX[x])[0]

dfX2 = dfX.drop(columns='ID')

# Split the stacked frame back into its train and submission parts by
# row position.
train_size = len(X_use)
XF = dfX2.iloc[:train_size]
X_submissionF = dfX2.iloc[train_size:]
# NOTE(review): assumes the rows of y_train.csv are in the same order as
# the rows of x_train.csv - confirm.
YF = Y['charges']
# print([x.shape for x in [XF, YF, X_submissionF]])

xtrain, xtest, ytrain, ytest = train_test_split(
    XF, YF, test_size=0.3, random_state=1234)

# Model comparison. Keep the make_models_f call commented out before
# submitting in the real exam, to stay within the execution time limit.
# models = make_models_f(xtrain, xtest, ytrain, ytest)
# print(models.sort_values('rmsle').head(5))

# Fit the chosen model and predict
model = RandomForestRegressor(
    n_estimators=300, max_depth=5, random_state=0).fit(xtrain, ytrain)
print(get_scores_f(model, xtrain, xtest, ytrain, ytest))

pred = model.predict(X_submissionF)
submission = pd.DataFrame({'ID': X_submission['ID'],
                           'charges': pred})

# Save to file
submission.to_csv('0001000.csv', index=False)
728x90
반응형
'PYTHON > 빅데이터분석기사' 카테고리의 다른 글
| [캐글-퇴근후딴짓] T3 문제풀이 (코드 저장용) (0) | 2023.11.29 |
|---|---|
| 빅데이터분석기사 제7회 공지 (0) | 2023.11.26 |
| ML_04 (regression s4-11~18 웹사이트 방문자 예측) (1) | 2023.11.20 |
| ML_04 (regression s4-01~10 회귀 모델링) (2) | 2023.11.19 |
| 11.15일 (model-1 DB연동/ 회원관리 및 게시판) (1) | 2023.11.15 |