
ML_04 (regression s4-19~21 insurance prediction)

by 쿠룽지 2023. 11. 20.

 

 

Code written purely for study/practice has been left out.

 

 

#import
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle

#Functions
#1 Scoring (performance evaluation)
def get_scores_f(model, xtrain, xtest, ytrain, ytest):
    pred1 = model.predict(xtrain)   # train
    pred2 = model.predict(xtest)    # test

    # r2 is fine even if the predictions contain negative values
    A1 = r2_score(ytrain, pred1) # train
    A2 = r2_score(ytest, pred2)

    # msle / rmsle are log-based, so the predictions must not contain negatives
    pred2 = np.where(pred2<0, 0, pred2)
    B = msle(ytest, pred2)
    C = np.sqrt(B)   # rmsle
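    # Here C is RMSLE = sqrt(mean((log(1 + actual) - log(1 + pred))**2)); the
    # log1p form tolerates zeros but not negative predictions, hence the clipping above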
    data = [round(x, 4) for x in [A1, A2, C]]
    names = 'r2_train r2_test rmsle'.split()
    scores = pd.Series(data, index=names)
    return  scores
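
# (Optional sanity-check sketch, not part of the solution) get_scores_f can be
# smoke-tested on a tiny made-up dataset; the toy names _x, _y, _m are assumptions:
# _x = pd.DataFrame({'a': range(20)}); _y = pd.Series(range(20))
# _m = DecisionTreeRegressor(random_state=0).fit(_x, _y)
# print(get_scores_f(_m, _x, _x, _y, _y))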

#2 Model building
def make_models_f(xtrain, xtest, ytrain, ytest, n=300):
    temp = pd.DataFrame()

    #LinearRegression
    model1 = LinearRegression().fit(xtrain, ytrain)
    temp['model1'] = get_scores_f(model1, xtrain, xtest, ytrain, ytest)

    #DecisionTreeRegressor
    model2 = DecisionTreeRegressor(random_state=0).fit(xtrain, ytrain)
    temp['model2'] = get_scores_f(model2, xtrain, xtest, ytrain, ytest)

    for d in range(3, 9):
        model2 = DecisionTreeRegressor(max_depth=d, random_state=0).fit(xtrain, ytrain)
        temp[f'model2_{d}'] = get_scores_f(model2, xtrain, xtest, ytrain, ytest)

    #RandomForestRegressor
    model3 = RandomForestRegressor(n, random_state=0).fit(xtrain, ytrain)
    temp['model3'] = get_scores_f(model3, xtrain, xtest, ytrain, ytest)

    for d in range(3, 9):
        model3 = RandomForestRegressor(n, max_depth=d, random_state=0).fit(xtrain, ytrain)
        temp[f'model3_{d}'] = get_scores_f(model3, xtrain, xtest, ytrain, ytest)

    #XGBRegressor
    model4 = XGBRegressor(objective='reg:squarederror').fit(xtrain, ytrain)
    temp['model4'] = get_scores_f(model4, xtrain, xtest, ytrain, ytest)

    temp = temp.T
    temp.insert(2, 'diff', (temp['r2_train'] - temp['r2_test']).abs())

    return temp

#Read the data
X_use = pd.read_csv('x_train.csv')
X_submission = pd.read_csv('x_test.csv')
Y = pd.read_csv('y_train.csv')
#print([x.shape for x in [X_use, X_submission, Y]]) # [(803, 7), (535, 7), (803, 2)]

#X_use + X_submission = dfX
dfX = pd.concat([X_use, X_submission], ignore_index=True, axis=0)
#dfX.info() >> no missing values; sex, smoker, region are object dtype
#print(dfX.nunique())

for x in ['sex', 'smoker', 'region']:
    temp = dfX[x].unique()
    dfX[x] = dfX[x].replace(temp, range(len(temp)))
#check dfX.info() again
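
# (Alternative sketch, not run here) pd.factorize would produce the same 0..n-1
# integer codes in one call; shown only as an option, assuming dfX still held the raw strings:
# for x in ['sex', 'smoker', 'region']:
#     dfX[x] = pd.factorize(dfX[x])[0]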

#print(dfX.columns)
dfX2 = dfX.drop(columns='ID')
#dfX2.columns

train_size = len(X_use)
XF = dfX2[:train_size]  # row-position slicing works whether dfX2 is a DataFrame or an ndarray
X_submissionF = dfX2[train_size:]
YF = Y['charges']
#print([x.shape for x in [XF, YF, X_submissionF]])

xtrain, xtest, ytrain, ytest = train_test_split(XF, YF, test_size=0.3, random_state=1234)
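
# (Optional sketch) MinMaxScaler is imported above but never used; the tree-based
# models here do not need scaling. If a scaled copy were wanted, e.g. for
# LinearRegression, one possible way (the *_s names are made up for illustration):
# scaler = MinMaxScaler().fit(xtrain)
# xtrain_s = pd.DataFrame(scaler.transform(xtrain), columns=xtrain.columns)
# xtest_s = pd.DataFrame(scaler.transform(xtest), columns=xtest.columns)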



# In the actual exam, be sure to comment out the make_models_f call below
# before submitting ^_^  (it must not push the run time over the limit)

#models = make_models_f(xtrain, xtest, ytrain, ytest)
#print(models.sort_values('rmsle').head(5))


#After choosing the model, fit it and make the final predictions
model = RandomForestRegressor(300, max_depth=5, random_state=0).fit(xtrain, ytrain)
print(get_scores_f(model, xtrain, xtest, ytrain, ytest))
pred = model.predict(X_submissionF)

submission = pd.DataFrame({'ID': X_submission['ID'],
                           'charges' : pred})
#submission.head()
#save to file
submission.to_csv('0001000.csv', index=False)
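
# (Extra check, not required) re-reading the saved file confirms the submission format:
# print(pd.read_csv('0001000.csv').head())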