
Modeling Code Practice (Regression, DT, RF)

머루아빠승우 2024. 9. 24. 00:15


IMPORT
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Split the dataset into train and test sets before modeling
# Linear regression model
# Decision tree model
# Random forest model
# Performance metrics: MSE and R^2
Regression model examples
A regression model fits linear or nonlinear relationships in past data and uses them to estimate future values.

# Generate example data: linear, nonlinear, and random relationships
np.random.seed(45)

# 1. Linearly related data
X_linear = np.random.rand(100, 1) * 10  # X in [0, 10)
y_linear = -2 * X_linear + 5 + np.random.randn(100, 1) * 2  # y = -2*X + 5 + noise

# 2. Nonlinearly related data
X_nonlinear = np.random.rand(100, 1) * 10
y_nonlinear = X_nonlinear ** 3 + np.random.randn(100, 1) * 10  # y = X^3 + noise

# 3. Random (unrelated) data
X_random = np.random.rand(100, 1) * 10
y_random = np.random.rand(100, 1) * 100  # y is completely random

# Wrap each X/y pair in its own DataFrame
data_linear = pd.DataFrame(np.hstack((X_linear, y_linear)), columns=["X", "y_linear"])
data_nonlinear = pd.DataFrame(np.hstack((X_nonlinear, y_nonlinear)), columns=["X", "y_nonlinear"])
data_random = pd.DataFrame(np.hstack((X_random, y_random)), columns=["X", "y_random"])

# Split into training and test sets
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(data_linear[["X"]], data_linear["y_linear"], test_size=0.2, random_state=42)
X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear = train_test_split(data_nonlinear[["X"]], data_nonlinear["y_nonlinear"], test_size=0.2, random_state=42)
X_train_random, X_test_random, y_train_random, y_test_random = train_test_split(data_random[["X"]], data_random["y_random"], test_size=0.2, random_state=42)

# Initialize the models
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Define a helper for comparing model performance
def evaluate_model(model, X_train, X_test, y_train, y_test, label):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)

    return {
        "Model": label,
        "Train MSE": mse_train,
        "Test MSE": mse_test,
        "Train R2": r2_train,
        "Test R2": r2_test
    }

# Evaluate each model on the linear, nonlinear, and random data
results = []

# Linear data
results.append(evaluate_model(lin_reg, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "Linear Regression (Linear Data)"))
results.append(evaluate_model(tree_reg, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "Decision Tree (Linear Data)"))
results.append(evaluate_model(forest_reg, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "Random Forest (Linear Data)"))

# Nonlinear data
results.append(evaluate_model(lin_reg, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "Linear Regression (Nonlinear Data)"))
results.append(evaluate_model(tree_reg, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "Decision Tree (Nonlinear Data)"))
results.append(evaluate_model(forest_reg, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "Random Forest (Nonlinear Data)"))

# Random data
results.append(evaluate_model(lin_reg, X_train_random, X_test_random, y_train_random, y_test_random, "Linear Regression (Random Data)"))
results.append(evaluate_model(tree_reg, X_train_random, X_test_random, y_train_random, y_test_random, "Decision Tree (Random Data)"))
results.append(evaluate_model(forest_reg, X_train_random, X_test_random, y_train_random, y_test_random, "Random Forest (Random Data)"))

# Convert the results to a DataFrame for easier reading
results_df = pd.DataFrame(results)
print(results_df)
                                Model    Train MSE      Test MSE  Train R2   Test R2
0     Linear Regression (Linear Data)     3.779240      4.542516  0.903102  0.892833
1         Decision Tree (Linear Data)     0.000000      9.462669  1.000000  0.776757
2         Random Forest (Linear Data)     0.679465      7.793680  0.982579  0.816132
3  Linear Regression (Nonlinear Data)  8570.669258  25115.668004  0.837569  0.797303
4      Decision Tree (Nonlinear Data)     0.000000   1834.269991  1.000000  0.985196
5      Random Forest (Nonlinear Data)   115.496110    645.121671  0.997811  0.994794
6     Linear Regression (Random Data)   815.420666    572.523459  0.000059 -0.145789
7         Decision Tree (Random Data)     0.000000   1777.583603  1.000000 -2.557473
8         Random Forest (Random Data)   153.770158   1311.497829  0.811433 -1.624697
Each model is trained and evaluated on the linear, nonlinear, and random datasets, split into training and test sets. MSE (the mean of the squared residuals) measures how far the predictions scatter from the targets, and comparing Train R2 with Test R2 shows which model generalizes best. Reading the table: linear regression performs best on the linear data, while the tree-based models (random forest in particular) perform best on the nonlinear data. The trees' Train MSE of 0 against a much larger Test MSE is a classic sign of overfitting, and on the random data every model has a negative Test R2, i.e., worse than just predicting the mean.
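As a quick check on what these metrics mean, here is a minimal sketch (with hypothetical toy numbers, not taken from the experiment above) that computes MSE and R^2 by hand and confirms the result against sklearn:

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Hypothetical toy values, just to illustrate the formulas
y_true = np.array([3.0, 5.0, 7.0, 9.0])
y_pred = np.array([2.5, 5.5, 6.0, 10.0])

# MSE: mean of the squared residuals
mse = np.mean((y_true - y_pred) ** 2)

# R^2 = 1 - SS_res / SS_tot; it goes negative when the model is worse than predicting the mean
ss_res = np.sum((y_true - y_pred) ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
r2 = 1 - ss_res / ss_tot

assert np.isclose(mse, mean_squared_error(y_true, y_pred))
assert np.isclose(r2, r2_score(y_true, y_pred))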

import matplotlib.pyplot as plt

# Define a function for visualizing the data
def plot_data(X, y, title):
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.title(title)
    plt.xlabel("X")
    plt.ylabel("y")
    plt.show()

# Visualize the linear data
plot_data(X_linear, y_linear, "Linear Data")

# Visualize the nonlinear data
plot_data(X_nonlinear, y_nonlinear, "Nonlinear Data")

# Visualize the random data
plot_data(X_random, y_random, "Random Data")



So far each plot has been written out by hand, but managing this with a function looks better.
# Visualization function: scatter the true data and overlay the model's prediction line
def plot_data_with_predictions(X, y, model, title):
    plt.scatter(X, y, color='blue', alpha=0.5, label='True')
    X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    y_pred = model.predict(X_range)
    plt.plot(X_range, y_pred, color='red', label='Prediction Line')
    plt.title(title)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.show()
X = X_test_linear
y = y_test_linear
model = lin_reg

plt.scatter(X, y, color="blue", alpha=0.5)
X_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_pred = model.predict(X_range)
c:\Users\chltm\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
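This warning appears because lin_reg was fitted on a DataFrame whose column is named "X", while predict here receives a bare NumPy array. A minimal sketch of one way to silence it, assuming the same column name used at fit time:

# Hypothetical fix: give the prediction input the same feature name used at fit time
X_range_df = pd.DataFrame(X_range, columns=["X"])
y_pred = model.predict(X_range_df)  # same predictions, no feature-names warning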

title = "선형회귀"  # "Linear Regression" in Korean; this Hangul title triggers the font warnings below
plt.plot(X_range, y_pred, color='red', label='Prediction Line')
plt.title(title)
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
C:\Users\chltm\AppData\Roaming\Python\Python312\site-packages\IPython\core\pylabtools.py:152: UserWarning: Glyph 49440 (\N{HANGUL SYLLABLE SEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
(the same warning repeats for the remaining Hangul syllables 형, 회, 귀 in the title)
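The default matplotlib font has no Hangul glyphs, so the Korean title cannot be drawn. A minimal sketch of one fix, assuming a Korean font such as Malgun Gothic is installed (it ships with Windows):

# Switch to a font that contains Hangul glyphs before plotting
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly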

What does -1 mean in reshape(-1, 1)?

reshape changes the dimensions of an array.
reshape(2, 1) means: arrange the data into 2 rows and 1 column.
reshape(-1, 1) fixes the column count at 1 and leaves the row count to NumPy.
In short: -1 means "infer this dimension from the array's length and the other dimensions."
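A minimal sketch of that behavior:

a = np.arange(6)               # shape (6,)
print(a.reshape(-1, 1).shape)  # (6, 1): 6 rows inferred from the element count
print(a.reshape(2, -1).shape)  # (2, 3): 3 columns inferred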
X_range
array([[0.64232171],
       [0.73590651],
       [0.82949131],
       ...,
       [9.81363205],
       [9.90721685]])
X_range.reshape(-1,1)
# X_range is already shape (100, 1), so this reshape is a no-op here
# So the x-axis range can be generated with np.linspace!
array([[0.64232171],
       [0.73590651],
       [0.82949131],
       ...,
       [9.81363205],
       [9.90721685]])
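For reference, a quick sketch of what np.linspace returns (illustrative values):

# 5 evenly spaced points from 0 to 10, endpoints included
print(np.linspace(0, 10, 5))  # [ 0.   2.5  5.   7.5 10. ]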
# Fit the model, then plot the test data with its prediction line
def train_and_plot(model, X_train, y_train, X_test, y_test, label):
    model.fit(X_train, y_train)
    plot_data_with_predictions(X_test, y_test, model, label)
Visualizing the linear data
train_and_plot(lin_reg, X_train_linear, y_train_linear, X_test_linear, y_test_linear, 'Linear Regression')
train_and_plot(tree_reg, X_train_linear, y_train_linear, X_test_linear, y_test_linear, 'Tree Regression')
train_and_plot(forest_reg, X_train_linear, y_train_linear, X_test_linear, y_test_linear, 'RF Regression')
c:\Users\chltm\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
(the same warning fires for DecisionTreeRegressor and RandomForestRegressor, since plot_data_with_predictions predicts on a bare ndarray)

Visualizing the nonlinear data
train_and_plot(lin_reg, X_train_nonlinear, y_train_nonlinear, X_test_nonlinear, y_test_nonlinear, 'Linear Regression')
train_and_plot(tree_reg, X_train_nonlinear, y_train_nonlinear, X_test_nonlinear, y_test_nonlinear, 'Tree Regression')
train_and_plot(forest_reg, X_train_nonlinear, y_train_nonlinear, X_test_nonlinear, y_test_nonlinear, 'RF Regression')
c:\Users\chltm\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
(the same warning fires for DecisionTreeRegressor and RandomForestRegressor)

Visualizing the random data
train_and_plot(lin_reg, X_train_random, y_train_random, X_test_random, y_test_random, 'Linear Regression')
train_and_plot(tree_reg, X_train_random, y_train_random, X_test_random, y_test_random, 'Tree Regression')
train_and_plot(forest_reg, X_train_random, y_train_random, X_test_random, y_test_random, 'RF Regression')
c:\Users\chltm\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
(the same warning fires for DecisionTreeRegressor and RandomForestRegressor)

Classification models
A linearly separable dataset
A nonlinearly separable dataset
A random dataset (hard to separate)
Models: Logistic Regression, DT, RF, SVC
How does each model separate the classes?
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_moons, make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
# 1. Generate the data
# Linearly separable data (two well-separated blobs)
X_linear, y_linear = make_blobs(n_samples=200, centers=2, n_features=2, random_state=42, cluster_std=1.5)

# Nonlinearly separable data (the moons dataset)
X_nonlinear, y_nonlinear = make_moons(n_samples=200, noise=0.3, random_state=42)

# Random data (hard to separate; flip_y=0.5 adds heavy label noise)
X_random, y_random = make_classification(n_samples=200, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=42, flip_y=0.5)
# 2. Split into training and test sets
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(X_linear, y_linear, test_size=0.2, random_state=42)
X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear = train_test_split(X_nonlinear, y_nonlinear, test_size=0.2, random_state=42)
X_train_random, X_test_random, y_train_random, y_test_random = train_test_split(X_random, y_random, test_size=0.2, random_state=42)

# 3. Initialize the models
log_reg = LogisticRegression()
tree_clf = DecisionTreeClassifier(random_state=42)
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(kernel='rbf', random_state=42)
# Define a function that evaluates performance and plots the decision boundary
def evaluate_and_plot_decision_boundary(model, X_train, X_test, y_train, y_test, title):
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)

    # Compute test accuracy
    acc = accuracy_score(y_test, y_pred_test)
    print(f"{title} Test Accuracy: {acc}")

    # Print the per-class classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred_test))

    # Plot the decision boundary
    plot_decision_boundary(model, X_train, y_train, title)

# Function that plots a model's decision boundary
def plot_decision_boundary(model, X, y, title):
    # Build a grid that covers the feature space with a small margin
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    # Predict a class for every grid point, then reshape back to the grid
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Filled contours show the predicted regions; the scatter shows the data
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.coolwarm)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40, edgecolor='k', marker='o', cmap=plt.cm.coolwarm)
    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

# Data visualization function
def plot_data(X, y, title):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette="coolwarm", alpha=0.8)
    plt.title(title)
    plt.show()

# Visualize each dataset
plot_data(X_linear, y_linear, "Linear Data")
plot_data(X_nonlinear, y_nonlinear, "Nonlinear Data")
plot_data(X_random, y_random, "Random Data")

# Linear data
evaluate_and_plot_decision_boundary(log_reg, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "Logistic Regression (Linear Data)")
evaluate_and_plot_decision_boundary(tree_clf, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "Decision Tree (Linear Data)")
evaluate_and_plot_decision_boundary(forest_clf, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "Random Forest (Linear Data)")
evaluate_and_plot_decision_boundary(svm_clf, X_train_linear, X_test_linear, y_train_linear, y_test_linear, "SVM (Linear Data)")

# Nonlinear data
evaluate_and_plot_decision_boundary(log_reg, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "Logistic Regression (Nonlinear Data)")
evaluate_and_plot_decision_boundary(tree_clf, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "Decision Tree (Nonlinear Data)")
evaluate_and_plot_decision_boundary(forest_clf, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "Random Forest (Nonlinear Data)")
evaluate_and_plot_decision_boundary(svm_clf, X_train_nonlinear, X_test_nonlinear, y_train_nonlinear, y_test_nonlinear, "SVM (Nonlinear Data)")

# Random data
evaluate_and_plot_decision_boundary(log_reg, X_train_random, X_test_random, y_train_random, y_test_random, "Logistic Regression (Random Data)")
evaluate_and_plot_decision_boundary(tree_clf, X_train_random, X_test_random, y_train_random, y_test_random, "Decision Tree (Random Data)")
evaluate_and_plot_decision_boundary(forest_clf, X_train_random, X_test_random, y_train_random, y_test_random, "Random Forest (Random Data)")
evaluate_and_plot_decision_boundary(svm_clf, X_train_random, X_test_random, y_train_random, y_test_random, "SVM (Random Data)")



Logistic Regression (Linear Data) Test Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Decision Tree (Linear Data) Test Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Random Forest (Linear Data) Test Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


SVM (Linear Data) Test Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        17

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Logistic Regression (Nonlinear Data) Test Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        22
           1       0.88      0.78      0.82        18

    accuracy                           0.85        40
   macro avg       0.85      0.84      0.85        40
weighted avg       0.85      0.85      0.85        40


Decision Tree (Nonlinear Data) Test Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.95      0.88        22
           1       0.93      0.72      0.81        18

    accuracy                           0.85        40
   macro avg       0.87      0.84      0.84        40
weighted avg       0.86      0.85      0.85        40


Random Forest (Nonlinear Data) Test Accuracy: 0.9
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91        22
           1       0.94      0.83      0.88        18

    accuracy                           0.90        40
   macro avg       0.91      0.89      0.90        40
weighted avg       0.90      0.90      0.90        40


SVM (Nonlinear Data) Test Accuracy: 0.875
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.86      0.88        22
           1       0.84      0.89      0.86        18

    accuracy                           0.88        40
   macro avg       0.87      0.88      0.87        40
weighted avg       0.88      0.88      0.88        40


Logistic Regression (Random Data) Test Accuracy: 0.7
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.81      0.74        21
           1       0.73      0.58      0.65        19

    accuracy                           0.70        40
   macro avg       0.71      0.69      0.69        40
weighted avg       0.71      0.70      0.70        40


Decision Tree (Random Data) Test Accuracy: 0.45
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.43      0.45        21
           1       0.43      0.47      0.45        19

    accuracy                           0.45        40
   macro avg       0.45      0.45      0.45        40
weighted avg       0.45      0.45      0.45        40


Random Forest (Random Data) Test Accuracy: 0.625
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.62      0.63        21
           1       0.60      0.63      0.62        19

    accuracy                           0.62        40
   macro avg       0.62      0.63      0.62        40
weighted avg       0.63      0.62      0.63        40


SVM (Random Data) Test Accuracy: 0.675
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.76      0.71        21
           1       0.69      0.58      0.63        19

    accuracy                           0.68        40
   macro avg       0.68      0.67      0.67        40
weighted avg       0.68      0.68      0.67        40


Summary
1. The DT, RF, and SVM models perform differently depending on the type and shape of the data.
2. The more finely a model carves up the space, the more it has to fight overfitting, so the model must be chosen to match the data's distribution (see the sketch after this list).
3. Even when the data contains noise, separating it well requires understanding the features at hand, e.g., engineering more discriminative features or re-weighting the data toward the signal (also sketched below).
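As a minimal, hedged sketch of points 2 and 3 (the hyperparameter values below are illustrative, not tuned): capping tree depth is one standard way to rein in an overfitting tree, and class_weight is one standard way to re-weight the data.

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Point 2: cap the depth so the tree cannot memorize the noisy training set
# (max_depth=3 is an arbitrary illustrative value)
pruned_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
pruned_tree.fit(X_train_random, y_train_random)
print("Pruned tree test accuracy:", pruned_tree.score(X_test_random, y_test_random))

# Point 3: weight samples inversely to class frequency instead of treating all equally
weighted_lr = LogisticRegression(class_weight='balanced')
weighted_lr.fit(X_train_random, y_train_random)
print("Weighted LR test accuracy:", weighted_lr.score(X_test_random, y_test_random))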
