# STEP 3: PREPROCESSING & FEATURE ENGINEERING
# At the end, we'll save processed data files that Steps 4, 5, and 6 load.
# Import libraries
import pandas as pd
import numpy as np
import joblib
# train_test_split: randomly splits data into training and testing portions
from sklearn.model_selection import train_test_split
# StandardScaler: rescales numbers so they all have mean=0 and std=1
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
#SimpleImputer: fills in missing values with chosen strategy
from sklearn.impute import SimpleImputer
# Pipeline: chains multiple steps (e.g., impute, then scale) into one reusable object
from sklearn.pipeline import Pipeline
# ColumnTransformer: applies different pipelines to different columns
# (e.g., numeric columns get one treatment, text columns get another)
from sklearn.compose import ColumnTransformer
# LOAD THE RAW DATASETS FROM STEP 1
datasets = joblib.load("raw_datasets.pkl")
ames_df = datasets["ames"]
kaggle_df = datasets["kaggle"]
print("Raw datasets loaded.")
# PART 1: PREPARE THE AMES DATASET
# 1a. Separate target from features
y = np.log1p(ames_df["SalePrice"]) # target: log(1 + SalePrice)
X = ames_df.drop(columns=["SalePrice"]) # features: everything except SalePrice
print(f"\nAmes feature matrix shape: {X.shape}") # (rows, columns)
print(f"Target vector shape: {y.shape}")
# 1b. Identify numeric vs categorical columns to treat them differently in preprocessing
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
print(f"\nNumeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")
# 1c. Train/test split
# TRAINING SET (80%): the model learns from this data
# TEST SET (20%): we hide this from the model during training,
# then use it to measure real-world performance
# random_state=42 ensures the split is the same every time the script is run
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,   # 20% of data goes to the test set
    random_state=42, # "seed" for reproducibility
)
print(f"\nTrain size: {X_train.shape[0]} houses")
print(f"Test size: {X_test.shape[0]} houses")
# 1d. Build the preprocessing pipeline
# NUMERIC PIPELINE:
#   SimpleImputer(strategy="median") fills missing values with the column median
#   StandardScaler rescales each column to mean=0, std=1
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# CATEGORICAL PIPELINE:
#   OrdinalEncoder converts each category to an integer code. By default the
#   codes follow alphabetical order, e.g. ["Average", "Good", "Poor"] → [0, 1, 2].
#   handle_unknown="use_encoded_value" with unknown_value=-1 means a category
#   seen at test time but never during training is encoded as -1 instead of
#   raising an error (see the tiny demo after this pipeline).
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])
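# Tiny demo (illustrative, safe to delete): the "qual" column and its values
# are hypothetical, not from the Ames data.
_demo_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
_demo_enc.fit(pd.DataFrame({"qual": ["Good", "Average", "Poor"]}))
print(_demo_enc.transform(pd.DataFrame({"qual": ["Average", "Unknown"]})))
# → [[ 0.], [-1.]]  "Average" keeps its learned code; unseen "Unknown" → -1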
# COLUMN TRANSFORMER: applies the right pipeline to the right columns
# "num" → applies numeric_pipeline to all numeric columns
# "cat" → applies categorical_pipeline to all categorical columns
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
])
# 1e. Fit and transform
# *Note to my partners: fit_transform on TRAIN only, transform (never fit) on TEST
print("\nFitting preprocessor on training data and transforming both sets...")
X_train_processed = preprocessor.fit_transform(X_train) # learn + apply
X_test_processed = preprocessor.transform(X_test) # apply only (don't re-learn)
print(f"Processed training set shape: {X_train_processed.shape}")
print(f"Processed test set shape: {X_test_processed.shape}")
# PART 2: PREPARE THE KAGGLE DATASET
if kaggle_df is not None:
    print("\n── Preparing Kaggle dataset for generalization test ──")
    # Load the feature mapping we saved earlier
    FEATURE_MAPPING = joblib.load("feature_mapping.pkl")
    # FEATURE_MAPPING = {"area": "GrLivArea", "bedrooms": "BedroomAbvGr", ...}
    # 2a. Normalize Kaggle prices
    # Log-transform Kaggle prices so they're on the same scale as the Ames target.
    # (Evaluate using % error rather than dollar error because the datasets use
    # different currencies.)
    kaggle_df = kaggle_df.copy()
    kaggle_df["log_price"] = np.log1p(kaggle_df["price"])
    # 2b. Extract/rename shared features to match the Ames names
    # (the preprocessor we built for Ames can be partially reused this way)
    kaggle_shared = kaggle_df[list(FEATURE_MAPPING.keys())].copy()
    kaggle_shared = kaggle_shared.rename(columns=FEATURE_MAPPING)
    # Now kaggle_shared has columns: GrLivArea, BedroomAbvGr, FullBath, GarageCars
    # 2c. Scale the Kaggle features using a fresh scaler.
    # We fit a new scaler on the Kaggle data (not Ames);
    # .fillna(0) handles any rare missing values.
    kaggle_scaler = StandardScaler()
    kaggle_processed = kaggle_scaler.fit_transform(kaggle_shared.fillna(0))
    kaggle_y = kaggle_df["log_price"].values # target: log(1 + price)
    print(f"Kaggle processed shape: {kaggle_processed.shape}")
    print(f"Shared features used: {list(FEATURE_MAPPING.values())}")
    # Save Kaggle-specific items
    joblib.dump(
        {
            "kaggle_X": kaggle_processed,
            "kaggle_y": kaggle_y,
            "kaggle_scaler": kaggle_scaler,
            "feature_mapping": FEATURE_MAPPING,
            "kaggle_raw": kaggle_df,
        },
        "kaggle_processed.pkl",
    )
    print("Saved: kaggle_processed.pkl")
else:
    print("\nKaggle dataset not loaded — skipping Kaggle preprocessing.")
# PART 3: SAVE EVERYTHING FOR STEPS 4, 5, AND 6
# Save the preprocessor so Step 6 can use it to prep new data
joblib.dump(preprocessor, "preprocessor.pkl")
# Save the processed Ames train/test sets & metadata
joblib.dump(
    {
        "X_train": X_train_processed,
        "X_test": X_test_processed,
        "y_train": y_train,
        "y_test": y_test,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
    },
    "train_test_data.pkl",
)
print("\n── Saved: preprocessor.pkl, train_test_data.pkl ──")
print("Step 3 complete! Proceed to step4_train_models.py")