🏠 Predicting House Prices in Ames, Iowa

CMSC 320 · Spring 2025 Final Project Tutorial

Contributions

Member | Sections | Summary (1–2 sentences)
Santosh Sureshkumar | A, B, C | Proposed initial idea; handled raw data ingestion & cleaning.
Aarnav Tare | D, E | Designed ML workflow; performed model selection & tuning.
Joshua Vallabhaneni | F, G | Led EDA visualizations, result interpretation, and wrote the final tutorial.

Introduction

Accurate housing price prediction represents a significant challenge in real estate analytics, with broad implications for both market participants and the economy at large. This project addresses the fundamental problem of determining which factors most significantly influence residential property values and developing predictive models that can accurately estimate sale prices from those factors. Housing is the largest investment for most Americans, making precise valuations important for informed decision-making by buyers, sellers, lenders, and investors. Inaccurate pricing can lead to substantial financial losses, market inefficiencies, and barriers to affordability.

The importance of this analysis extends beyond individual transactions to housing policy, mortgage lending practices, and urban development strategies. By identifying key price determinants, stakeholders can make data-driven decisions that optimize value while promoting sustainable communities.

Our investigation centers on several essential questions: Which property features and neighborhood characteristics most strongly correlate with sale prices? How effectively can machine learning models predict house prices using these features? What insights about housing market dynamics can be derived from statistical modeling? Through comprehensive analysis of the Ames, Iowa housing dataset, we aim to develop accurate predictive models while providing actionable insights for real estate stakeholders.

In [ ]:
# %% Imports + warnings  ############################################
# ==== IMPORTING LIBRARIES ====

# Data manipulation and analysis
import numpy as np
import pandas as pd
from scipy import stats

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import StrMethodFormatter

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb

# Warnings and display settings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Plot aesthetics
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("colorblind")          # Color-blind safe palette
plt.rcParams['figure.figsize'] = (12, 6)
In [ ]:
# %% Load CSV & head  ################################################
# ==== LOADING THE DATASET ====
df = pd.read_csv("house_prices.csv")

print(f"Dataset Shape: {df.shape}")
print(f"Number of Features: {df.shape[1]}")
print(f"Number of Samples: {df.shape[0]}")

print("\nPreview of the dataset:")
display(df.head())
Dataset Shape: (1460, 81)
Number of Features: 81
Number of Samples: 1460

Preview of the dataset:
(Output: df.head() renders the first five rows of the 81-column table. The preview spans identifiers such as Id and MSSubClass, lot and zoning attributes (MSZoning, LotFrontage, LotArea), quality ratings (OverallQual, ExterQual, KitchenQual), area measurements (TotalBsmtSF, GrLivArea, GarageArea), sale details (MoSold, YrSold, SaleCondition), and the target SalePrice, which ranges from $140,000 to $250,000 across these five houses.)

Data source: De Cock, D. (2011). Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project. Journal of Statistics Education, 19(3). Distributed via Kaggle.

In [ ]:
# %%Data‑type / missing summary  ###################################
# ==== DATA PREPROCESSING ====
print("\n==== DATA STRUCTURE ====")
print(f"Data types:\n{df.dtypes.value_counts()}")
print(f"\nMissing values summary:\n{df.isnull().sum().sum()} total missing values")



numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns
print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")
==== DATA STRUCTURE ====
Data types:
object     43
int64      35
float64     3
Name: count, dtype: int64

Missing values summary:
7829 total missing values

Numeric features: 38
Categorical features: 43

The raw Ames housing file contains 1,460 observations and 81 variables. Thirty-eight are numeric and capture measurements such as square footage or construction year, while forty-three are categorical strings describing zoning, materials, or neighborhood. The dataset is relatively clean: only 7,829 individual entries, about 6.6 percent of the entire matrix, are missing. Most of these nulls are concentrated in four amenity columns (Alley, PoolQC, Fence, and MiscFeature) that each exceed a 60 percent missing rate and are therefore dropped in the next cell. The scattered nulls in the remaining 77 features are then imputed, yielding a complete table ready for modeling.

In [ ]:
# %% Missing‑value logic  ###########################################
print("\n==== HANDLING MISSING VALUES ====")
missing_vals = df.isnull().mean() * 100
cols_to_drop = missing_vals[missing_vals > 60].index.tolist()
print(f"Dropping columns with >60% missing values: {cols_to_drop}")
df = df.drop(columns=cols_to_drop, errors='ignore')

for col in df.select_dtypes(include=['int64', 'float64']).columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include=['object']).columns:
    missing_pct = df[col].isnull().mean() * 100
    if missing_pct > 0 and missing_pct < 50:
        df[col] = df[col].fillna(df[col].mode()[0])
    elif missing_pct >= 50:
        df[col] = df[col].fillna('None')

print(f"Remaining missing values: {df.isnull().sum().sum()}")

print("\n==== CONVERTING DATA TYPES ====")
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

ordinal_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
                   'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 
                   'GarageCond', 'PoolQC', 'Fence']
ordinal_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0, 'NA': 0}
for col in ordinal_features:
    if col in df.columns:
        df[col] = df[col].astype(str).map(ordinal_map).fillna(0).astype(int)
==== HANDLING MISSING VALUES ====
Dropping columns with >60% missing values: ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
Remaining missing values: 0

==== CONVERTING DATA TYPES ====
In [ ]:
# %% Outlier capper  #################################################
print("\n==== HANDLING OUTLIERS ====")
def handle_outliers(df, column, method='cap'):
    """Winsorize `column` in place: values beyond 1.5 * IQR are clipped to the fences."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
    print(f"Outliers in {column}: {outliers}")
    if method == 'cap':
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    return df

if 'SalePrice' in df.columns:
    df = handle_outliers(df, 'SalePrice')

important_numeric = ['LotArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF']
for col in important_numeric:
    if col in df.columns:
        df = handle_outliers(df, col)
==== HANDLING OUTLIERS ====
Outliers in SalePrice: 61
Outliers in LotArea: 69
Outliers in GrLivArea: 31
Outliers in TotalBsmtSF: 61
Outliers in 1stFlrSF: 20

Extreme values are moderated using an inter‑quartile capping rule that replaces observations lying beyond 1.5 × IQR with the nearest boundary. This adjustment affects 61 SalePrice records, 69 LotArea entries, 31 for GrLivArea, 61 for TotalBsmtSF, and 20 for 1stFlrSF. Fewer than five percent of rows are altered for any single feature, which preserves statistical power while preventing a handful of aberrant points from exerting undue influence during model fitting.
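The arithmetic of the capping rule can be verified on a handful of toy values (hypothetical numbers, not rows from the dataset): with Q1 = 150 and Q3 = 250, the IQR is 100, the upper fence sits at 250 + 1.5 × 100 = 400, and only the extreme entry is clipped.

```python
import pandas as pd

# Toy check of the IQR fence used above (hypothetical values).
toy = pd.Series([100.0, 150.0, 200.0, 250.0, 1_000.0])
q1, q3 = toy.quantile(0.25), toy.quantile(0.75)   # 150.0 and 250.0
upper_fence = q3 + 1.5 * (q3 - q1)                # 400.0
capped = toy.clip(upper=upper_fence)              # 1_000.0 becomes 400.0
```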

In [ ]:
# %% Feature engineering  ###########################################
print("\n==== FEATURE ENGINEERING ====")
if all(col in df.columns for col in ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    print("Created TotalSF feature")

if 'YearBuilt' in df.columns:
    current_year = 2025
    df['HouseAge'] = current_year - df['YearBuilt']
    print("Created HouseAge feature")

if all(col in df.columns for col in ['YearBuilt', 'YearRemodAdd']):
    # Note: this measures the gap between construction and the last remodel,
    # not the time elapsed since that remodel.
    df['YearsSinceRenovation'] = (df['YearRemodAdd'] - df['YearBuilt']).clip(lower=0)
    print("Created YearsSinceRenovation feature")

bathroom_cols = [col for col in df.columns if 'Bath' in col]
if bathroom_cols:
    # Note: half baths are counted at the same weight as full baths here.
    df['TotalBathrooms'] = df[bathroom_cols].sum(axis=1)
    print("Created TotalBathrooms feature")

if 'PoolArea' in df.columns:
    df['HasPool'] = (df['PoolArea'] > 0).astype(int)
    print("Created HasPool feature")
if 'GarageArea' in df.columns:
    df['HasGarage'] = (df['GarageArea'] > 0).astype(int)
    print("Created HasGarage feature")
if 'TotalBsmtSF' in df.columns:
    df['HasBasement'] = (df['TotalBsmtSF'] > 0).astype(int)
    print("Created HasBasement feature")
==== FEATURE ENGINEERING ====
Created TotalSF feature
Created HouseAge feature
Created YearsSinceRenovation feature
Created TotalBathrooms feature
Created HasPool feature
Created HasGarage feature
Created HasBasement feature

Seven domain‑driven variables are created to enrich the information set. TotalSF aggregates above‑ground and basement areas and correlates strongly with price. HouseAge captures depreciation effects and shows a negative association with value, whereas YearsSinceRenovation measures modernization recency. TotalBathrooms quantifies amenity availability. Three binary indicators—HasPool, HasGarage, and HasBasement—flag high‑impact features that buyers frequently seek. These engineered predictors sharpen interpretability and inject additional signal for downstream modeling.
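One caveat worth noting: the TotalBathrooms feature above sums all four 'Bath' columns at equal weight, whereas the common real-estate convention counts half baths at half weight. A hedged variant of that convention, shown on hypothetical rows rather than the real data, would look like this:

```python
import pandas as pd

# Weighted-bathroom variant (a sketch, not what the cell above computes).
rows = pd.DataFrame({'FullBath': [2, 1], 'HalfBath': [1, 0],
                     'BsmtFullBath': [1, 0], 'BsmtHalfBath': [0, 1]})
rows['Equal'] = rows.sum(axis=1)                        # the notebook's equal-weight sum
rows['Weighted'] = (rows['FullBath'] + rows['BsmtFullBath']
                    + 0.5 * (rows['HalfBath'] + rows['BsmtHalfBath']))
```

Either choice is defensible; the weighted version simply encodes that a half bath adds less value than a full one.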

In [ ]:
# %% Target distribution + log  #####################################
print("\n==== TARGET VARIABLE DISTRIBUTION ====")
if 'SalePrice' in df.columns:
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    sns.histplot(df['SalePrice'], kde=True)
    plt.title('Distribution of SalePrice')
    plt.xlabel('Price ($)')
    plt.subplot(1, 2, 2)
    stats.probplot(df['SalePrice'], dist="norm", plot=plt)
    plt.title('Q-Q Plot of SalePrice')
    plt.tight_layout()
    plt.show()

    print(f"SalePrice Statistics:\n{df['SalePrice'].describe()}")
    print(f"Skewness: {df['SalePrice'].skew():.2f}")
    print(f"Kurtosis: {df['SalePrice'].kurt():.2f}")
    if df['SalePrice'].skew() > 0.5:
        print("SalePrice is positively skewed. Log transformation recommended.")
        df['LogSalePrice'] = np.log1p(df['SalePrice'])
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        sns.histplot(df['LogSalePrice'], kde=True)
        plt.title('Distribution of Log-Transformed SalePrice')
        plt.subplot(1, 2, 2)
        stats.probplot(df['LogSalePrice'], dist="norm", plot=plt)
        plt.title('Q-Q Plot of Log-Transformed SalePrice')
        plt.tight_layout()
        plt.show()
        print(f"Log-SalePrice Skewness: {df['LogSalePrice'].skew():.2f}")
==== TARGET VARIABLE DISTRIBUTION ====
[Figure: histogram with KDE (left) and normal Q-Q plot (right) of SalePrice]
SalePrice Statistics:
count      1460.000000
mean     177331.526370
std       67205.835915
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      340037.500000
Name: SalePrice, dtype: float64
Skewness: 0.80
Kurtosis: 0.09
SalePrice is positively skewed. Log transformation recommended.
[Figure: histogram (left) and normal Q-Q plot (right) of log-transformed SalePrice]
Log-SalePrice Skewness: -0.19

The target variable SalePrice exhibits a right-skewed distribution. The histogram of SalePrice (left) shows most homes clustering around moderate price levels (approximately $100,000–$200,000), with a long tail extending toward higher prices up to around $350,000. This indicates that while a majority of houses sell for mid-range values, a few expensive properties pull the mean to the right of the median. The Q–Q plot of SalePrice (right) highlights the deviation from normality: the observed quantiles (blue points) bend away from the red diagonal line at the upper end, confirming heavy tails (i.e. more extreme high values than a normal distribution would predict). In summary, the raw SalePrice is not normally distributed – it is appreciably skewed – which can pose challenges for modeling techniques that assume normality or homoscedasticity in residuals.

Applying a logarithmic transformation to SalePrice markedly improves its distribution. The histogram of log-transformed SalePrice appears much more symmetric and bell-shaped, indicating that the extreme high values have been pulled inwards to a more balanced scale. Correspondingly, the Q–Q plot of log(SalePrice) aligns much closer to the diagonal reference line, especially in the higher quantiles, suggesting the log values follow an approximately normal pattern. There is still a slight deviation at the very extremes, but it is far less pronounced than before. By stabilizing variance and correcting skewness, this log transformation addresses the non-normality of the target. This makes subsequent modeling more reliable – for instance, linear regression can make better use of a target that behaves normally, and the influence of outliers is tempered on the log scale.
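One practical consequence of this choice: a model fit on log1p(SalePrice) makes predictions on the log scale, so they must be mapped back to dollars with the exact inverse, expm1. The sketch below (using the quartile prices from the summary statistics above) also shows why log-scale error reads as a proportional error in dollars.

```python
import numpy as np

# Round trip between dollar prices and the log1p scale used for modeling.
prices = np.array([129_975.0, 163_000.0, 214_000.0])  # quartiles from the output above
log_prices = np.log1p(prices)       # log(1 + x); safe even at x = 0
recovered = np.expm1(log_prices)    # exact inverse of log1p

# A residual of 0.1 on the log scale corresponds to roughly a 10% error in dollars.
pct_for_log_error = np.expm1(0.1)   # ~0.105
```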

In [ ]:
# %% Correlation heat‑map + scatter  ################################
print("\n==== CORRELATION ANALYSIS ====")
if 'SalePrice' in df.columns:
    numeric_df = df.select_dtypes(include=['int64', 'float64'])
    correlations = numeric_df.corr()['SalePrice'].sort_values(ascending=False)
    print("Top 15 Positive Correlations:")
    print(correlations.head(15))
    print("\nTop 15 Negative Correlations:")
    print(correlations.tail(15))

    plt.figure(figsize=(14, 10))
    top_corr_features = correlations.index[:10].tolist() + correlations.index[-5:].tolist()
    if 'SalePrice' not in top_corr_features:
        top_corr_features.append('SalePrice')
    corr_matrix = numeric_df[top_corr_features].corr()
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Heatmap of Top Features')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(15, 10))
    # Skip LogSalePrice: it is a transform of the target, not a predictor.
    top_features = [f for f in correlations.index
                    if f not in ('SalePrice', 'LogSalePrice')][:6]
    for i, feature in enumerate(top_features):
        plt.subplot(2, 3, i+1)
        plt.scatter(df[feature], df['SalePrice'], alpha=0.5)
        plt.title(f'SalePrice vs {feature}')
        plt.xlabel(feature)
        plt.ylabel('SalePrice')
    plt.tight_layout()
    plt.show()
==== CORRELATION ANALYSIS ====
Top 15 Positive Correlations:
SalePrice         1.000000
LogSalePrice      0.969175
TotalSF           0.821488
OverallQual       0.816856
GrLivArea         0.729311
ExterQual         0.699291
KitchenQual       0.679995
GarageCars        0.672293
BsmtQual          0.671338
GarageArea        0.650429
TotalBsmtSF       0.645251
TotalBathrooms    0.637350
1stFlrSF          0.621873
FullBath          0.583994
YearBuilt         0.568918
Name: SalePrice, dtype: float64

Top 15 Negative Correlations:
ExterCond               0.021384
BsmtFinSF2             -0.007869
MiscVal                -0.020362
FireplaceQu            -0.025519
BsmtHalfBath           -0.026997
Id                     -0.027439
YrSold                 -0.031358
LowQualFinSF           -0.040036
OverallCond            -0.075123
MSSubClass             -0.085170
EnclosedPorch          -0.139925
KitchenAbvGr           -0.149355
YearsSinceRenovation   -0.236368
HouseAge               -0.568918
HasBasement                  NaN
Name: SalePrice, dtype: float64
[Figure: lower-triangle correlation heatmap of the top features]
[Figure: scatter plots of SalePrice against the six most correlated features]

In this step, we examine how various features correlate with the sale price. The correlation heatmap of top variables reveals that SalePrice has strong positive associations with several key features. For instance, total square footage (TotalSF) and the overall quality rating (OverallQual) each correlate at about 0.82 with SalePrice, marking them as major drivers of higher home values. Above-ground living area (GrLivArea) is also highly correlated (~0.73), as are the quality ratings of the exterior and kitchen (ExterQual ~0.70 and KitchenQual ~0.68) and garage capacity in car spaces (GarageCars, ~0.67). In contrast, house age shows a notable negative correlation (about -0.57), indicating that older homes tend to sell for less than newer ones. (The HasBasement indicator returns a NaN correlation because the earlier IQR capping raised zero-basement entries of TotalBsmtSF up to the lower fence, leaving the indicator constant and without variance.) These correlations align with real estate intuition: bigger and better-quality homes typically command higher prices, while older homes are valued lower.

The scatter plots further illustrate these relationships with SalePrice. In the plot of SalePrice vs. TotalSF, we see a clear upward trend: larger total living area corresponds to higher sale prices, following a roughly linear pattern until very high square footages where the trend flattens slightly (suggesting diminishing returns for extremely large homes). The relationship with OverallQual appears as distinct vertical bands – houses with higher quality ratings (e.g., 8, 9, 10) consistently achieve greater prices than those with average quality (5 or 6), indicating a strong stepwise effect of quality on value. Each increment in OverallQual is associated with a noticeably higher SalePrice range, reflecting buyers’ willingness to pay a premium for superior overall quality.

Other feature relationships show similar patterns. For GrLivArea, the positive correlation is evident: as living area increases, SalePrice generally increases as well, with most points forming an upward cloud. There are a few points representing very large houses that sold for less than expected given their size (visible as outliers where high GrLivArea does not yield a proportionally high SalePrice) – these could be special cases such as foreclosures or homes in need of significant repairs. Similarly, houses with excellent exterior or kitchen quality (ExterQual, KitchenQual at the highest levels) tend to cluster at the upper end of SalePrice, whereas those with only average quality seldom reach the top prices. Overall, the scatterplots corroborate the correlation findings and suggest predominantly monotonic relationships between these features and SalePrice. While most data points follow the expected trends, the few deviations (e.g. oversized low-price outliers) highlight the importance of considering potential anomalies or additional factors (like neighborhood or sale conditions) that might explain those cases.
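Because several of these relationships are monotonic but not straight-line (OverallQual rises in steps, TotalSF flattens at the top), it can be worth checking a rank correlation alongside Pearson's r. The toy illustration below (synthetic values, not the Ames sample) shows the gap the two measures can open up on a convex but perfectly monotonic trend: Spearman's rho captures any monotonic association, while Pearson's r measures only linear fit.

```python
import numpy as np
from scipy import stats

# Synthetic ordinal scores 1-10 (like OverallQual) with a monotonic, convex response.
quality = np.arange(1, 11).repeat(10)
price = np.exp(0.5 * quality)          # strictly increasing but strongly curved

pearson_r, _ = stats.pearsonr(quality, price)     # well below 1: trend is not linear
spearman_rho, _ = stats.spearmanr(quality, price) # 1.0: trend is perfectly monotonic
```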

In [37]:
# %% Cell 10 – RL vs RM t‑test  ##############################################
def compare_two_zones_ttest(df, cat_column='MSZoning', val1='RL', val2='RM', target='SalePrice'):
    subset1 = df[df[cat_column] == val1][target].dropna()
    subset2 = df[df[cat_column] == val2][target].dropna()
    if len(subset1) < 2 or len(subset2) < 2:
        print("Not enough data for t‑test.")
        return
    t_stat, p_val = stats.ttest_ind(subset1, subset2, equal_var=False)
    print(f"T‑test {val1} vs {val2}: t={t_stat:.3f}, p={p_val:.5f}")
    print(f"Mean_{val1}={subset1.mean():.2f}  Mean_{val2}={subset2.mean():.2f}")

if 'MSZoning' in df.columns and 'SalePrice' in df.columns:
    compare_two_zones_ttest(df)
T‑test RL vs RM: t=17.311, p=0.00000
Mean_RL=186641.27  Mean_RM=125457.08

In this cell, we conduct a statistical test to compare sale prices between two major zoning categories in the dataset: RL (Residential Low Density) and RM (Residential Medium Density). An independent two-sample t-test is performed to determine whether the mean SalePrice differs significantly between homes in these two zoning classes. The results reveal a pronounced difference: the average sale price for RL-zoned homes is substantially higher than that for RM-zoned homes, and the t-test returns a p-value far below the conventional 0.05 threshold (indeed, p ≪ 0.01). Thus, we reject the null hypothesis of equal means and conclude that zoning classification has a significant impact on property values. This finding is intuitive – RL zones (low-density residential areas, often characterized by larger lots and possibly more upscale developments) tend to have more expensive homes than RM zones (medium-density areas, which might include smaller lots or multi-family units). In practice, this suggests that when predicting house prices, it’s important to account for zoning or neighborhood density, as it correlates with price differences that are unlikely to be due to chance alone.
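A natural supplement to the t-test is an effect size, since a tiny p-value says only that the means differ, not by how much. The sketch below runs the same Welch's t-test on synthetic draws (the group means are taken from the output above, but the samples and spreads are hypothetical) and adds Cohen's d, which expresses the gap in standard-deviation units.

```python
import numpy as np
from scipy import stats

# Synthetic stand-ins for the RL and RM price samples (means from the output above).
rng = np.random.default_rng(42)
rl = rng.normal(186_641, 60_000, size=1_000)
rm = rng.normal(125_457, 45_000, size=200)

t_stat, p_val = stats.ttest_ind(rl, rm, equal_var=False)  # Welch's t-test, as in the cell

# Cohen's d with a simple pooled standard deviation.
pooled_sd = np.sqrt((rl.std(ddof=1) ** 2 + rm.std(ddof=1) ** 2) / 2)
cohens_d = (rl.mean() - rm.mean()) / pooled_sd            # > 0.8 counts as a large effect
```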

In [38]:
# %% Cell 11 – Prep, split, pipelines  #######################################
print("\n==== DATA PREPARATION FOR MODELING ====")
target = 'LogSalePrice' if 'LogSalePrice' in df.columns else 'SalePrice'
y = df[target]
X = df.drop(['SalePrice', 'LogSalePrice'] if 'LogSalePrice' in df.columns else ['SalePrice'], axis=1)

categorical_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Target variable: {target}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical features: {len(categorical_cols)}")
print(f"Number of numerical features: {len(numerical_cols)}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
==== DATA PREPARATION FOR MODELING ====
Target variable: LogSalePrice
Number of features: 83
Number of categorical features: 30
Number of numerical features: 53
Training set size: 1168
Testing set size: 292

This cell summarizes the data preparation carried out before modeling. Missing values were handled systematically in the earlier preprocessing cells: columns missing more than 60 percent of their entries (Alley, PoolQC, Fence, MiscFeature) were dropped outright; remaining numeric nulls were filled with each column's median; and categorical nulls were filled with the column mode, or with an explicit 'None' label where most of the column was missing, which turns "no such feature" into a meaningful category rather than a gap. Extreme values in SalePrice and the key area features were winsorized with the 1.5 × IQR capping rule rather than removed, so no rows were lost and a handful of aberrant points cannot dominate the fit. As a safety net, the modeling pipeline itself also carries SimpleImputer steps (median for numeric columns, a constant 'missing' label for categoricals) to guard against any nulls that appear at transform time. After these steps, the table feeding the models contains no missing entries and no uncapped extremes that could distort the analysis.

Another crucial part of data preparation was feature engineering and encoding, aimed at enhancing the predictive power of our models. We created several new features to capture important aspects of a house's value. TotalSF combines basement, first-floor, and second-floor areas to represent overall size more comprehensively than any single area measure, while time-related features such as HouseAge (years since construction) and YearsSinceRenovation capture depreciation and modernization effects, under the premise that newer or recently updated homes command higher prices. For categorical variables, we applied appropriate encoding techniques. Ordinal features, those with an inherent order, were converted to numeric scales reflecting their rank: quality ratings such as ExterQual and KitchenQual (originally coded Ex, Gd, TA, Fa, Po) were mapped to integers from 5 (Excellent) down to 1 (Poor), so the model can treat the difference between Excellent and Good as a quantifiable step in quality. (OverallQual arrives in the data already as a 1–10 integer and needs no mapping.) Nominal categorical features, those without an intrinsic order such as Neighborhood or MSZoning, are one-hot encoded inside the pipeline: one binary indicator per category, allowing the model to learn a separate effect for each neighborhood or zoning type without imposing any ordinal structure. Numeric features are standardized by the pipeline's StandardScaler, which matters for scale-sensitive models such as the regularized linear regressions, while tree-based methods handle raw scales gracefully. With the target already log-transformed and these preprocessing steps applied, the dataset is fully prepared for modeling.
At this stage, we have a clean, enriched feature set that encodes domain knowledge (size, quality, age, etc.) and a target variable that is more amenable to prediction, setting the foundation for effective model training.

In [ ]:
# %% Cell 12 – Model comparison  #############################################
print("\n==== MODEL TRAINING AND EVALUATION ====")
def evaluate_model(model, X_train, y_train, cv=5):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    cv_rmse = np.sqrt(-cross_val_score(pipeline, X_train, y_train,
                                       scoring='neg_mean_squared_error', cv=cv))
    cv_mae  = -cross_val_score(pipeline, X_train, y_train,
                               scoring='neg_mean_absolute_error', cv=cv)
    cv_r2   =  cross_val_score(pipeline, X_train, y_train,
                               scoring='r2', cv=cv)
    return {'RMSE': cv_rmse.mean(), 'MAE': cv_mae.mean(), 'R2': cv_r2.mean()}

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression' : Ridge(),
    'Lasso Regression' : Lasso(),
    'Elastic Net'      : ElasticNet(),
    'Decision Tree'    : DecisionTreeRegressor(),
    'Random Forest'    : RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost'          : xgb.XGBRegressor()
}

results = {}
for name, mdl in models.items():
    print(f"Evaluating {name}...")
    results[name] = evaluate_model(mdl, X_train, y_train)
    print(f"  RMSE: {results[name]['RMSE']:.4f}")
    print(f"  MAE : {results[name]['MAE']:.4f}")
    print(f"  R²  : {results[name]['R2']:.4f}")

results_df = pd.DataFrame(results).T.sort_values('RMSE')
display(results_df)
best_model_name = results_df.index[0]
print(f"\nBest model based on RMSE: {best_model_name}")
==== MODEL TRAINING AND EVALUATION ====
Evaluating Linear Regression...
  RMSE: 909742444.5424
  MAE : 94151995.4249
  R²  : -7400164042925058048.0000
Evaluating Ridge Regression...
  RMSE: 0.1176
  MAE : 0.0821
  R²  : 0.8992
Evaluating Lasso Regression...
  RMSE: 0.3720
  MAE : 0.2956
  R²  : -0.0090
Evaluating Elastic Net...
  RMSE: 0.3720
  MAE : 0.2956
  R²  : -0.0090
Evaluating Decision Tree...
  RMSE: 0.1917
  MAE : 0.1405
  R²  : 0.7185
Evaluating Random Forest...

In this cell, we build and evaluate several predictive models for SalePrice, comparing their performance to determine which best captures the patterns in the data. We began with a basic multiple linear regression as a baseline to see how a simple linear combination of features fares. Next, recognizing that multicollinearity in the high-dimensional one-hot feature space could be an issue, we tried regularized linear models (Ridge, Lasso, and Elastic Net) that penalize large coefficients and can improve generalization. Beyond linear models, we explored more flexible non-linear learners: a single Decision Tree, a Random Forest regressor (an ensemble of decision trees averaging their predictions), scikit-learn's Gradient Boosting regressor, and XGBoost, which builds trees sequentially so that each new tree corrects the errors of the ones before it. Model performance was assessed via 5-fold cross-validation on the training set, using the root mean squared error (RMSE) of the log-transformed SalePrice as the primary metric. (By evaluating error on the log scale, we are essentially measuring proportional prediction error, which is appropriate given our log transformation of the target.)

The results tell a clear story, though not the one a complexity-favors-accuracy intuition might predict. The plain linear regression failed outright, with an RMSE near 9×10⁸ and an absurdly negative R²: with many collinear one-hot dummy columns, the unregularized design matrix is nearly singular, and the solver produces enormous, unstable coefficients that explode on held-out folds. Ridge regression fixed this completely and posted the best scores of the comparison (CV RMSE ≈ 0.118, R² ≈ 0.90) — the L2 penalty tames the collinear dummies and yields a stable, well-generalizing fit. Lasso and Elastic Net over-shrank instead: their RMSE of 0.372 and R² ≈ 0 indicate they zeroed out essentially every coefficient and defaulted to predicting roughly the mean log price, a sign that their penalty strength was far too aggressive for this standardized log-scale target. The single Decision Tree landed in between (RMSE 0.192, R² 0.72), capturing real structure but overfitting relative to the regularized linear model. The broader lesson is that after the log transformation, SalePrice is close to a linear function of the engineered features, so a properly regularized linear model is very hard to beat here — and that regularization strength matters as much as model family, since the same linear hypothesis class produced both the best and the worst results in the table depending on how it was penalized.
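The linear-regression blow-up printed above is a textbook symptom of (near-)perfect collinearity among the one-hot columns; a tiny sketch of why ridge's penalty cures it (a hypothetical two-column example, not the notebook's actual design matrix):

```python
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 1))
X = np.hstack([x, x])                      # two perfectly collinear columns
y = 3 * x[:, 0] + rng.normal(scale=0.1, size=200)

# The OLS normal equations X'X b = X'y are singular here, so the
# unregularized solution is not unique and can be wildly unstable:
rank = np.linalg.matrix_rank(X.T @ X)      # rank 1, not 2

# Ridge adds alpha*I to X'X, making it invertible; the penalty selects the
# stable symmetric solution that splits the true weight across the twins.
ridge = Ridge(alpha=1.0).fit(X, y)
```

The two ridge coefficients come out equal (about 1.5 each, summing to roughly the true weight of 3), which is exactly the kind of stability the unpenalized fit cannot guarantee.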

In [ ]:
# %% Cell 13 – Grid search  ###################################################
print("\n==== HYPERPARAMETER TUNING ====")
param_grid = {}
if best_model_name == 'Linear Regression':
    param_grid = {'model__fit_intercept': [True, False]}
elif best_model_name == 'Ridge Regression':
    param_grid = {'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
elif best_model_name == 'Lasso Regression':
    param_grid = {'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]}
elif best_model_name == 'Elastic Net':
    param_grid = {'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
                  'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
elif best_model_name == 'Decision Tree':
    param_grid = {'model__max_depth': [None, 10, 20, 30],
                  'model__min_samples_split': [2, 5, 10]}
elif best_model_name == 'Random Forest':
    param_grid = {'model__n_estimators': [100, 200],
                  'model__max_depth': [None, 10, 20, 30],
                  'model__min_samples_split': [2, 5, 10]}
elif best_model_name == 'Gradient Boosting':
    param_grid = {'model__n_estimators': [100, 200],
                  'model__learning_rate': [0.01, 0.1, 0.2],
                  'model__max_depth': [3, 5, 7]}
elif best_model_name == 'XGBoost':
    param_grid = {'model__n_estimators': [100, 200],
                  'model__learning_rate': [0.01, 0.1, 0.2],
                  'model__max_depth': [3, 5, 7],
                  'model__colsample_bytree': [0.7, 0.8, 0.9]}

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', models[best_model_name])])
grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                           scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {np.sqrt(-grid_search.best_score_):.4f} RMSE")
==== HYPERPARAMETER TUNING ====
Best parameters: {'model__alpha': 10.0}
Best CV score: 0.1174 RMSE

With Ridge regression identified as the top performer, Cell 13 fine-tunes its hyperparameters via grid search to squeeze out the best possible performance. The code builds a param_grid tailored to whichever model won the comparison — for Ridge, the single hyperparameter that matters is alpha, the strength of the L2 penalty, searched over five values spanning four orders of magnitude (0.01 to 100). For each candidate alpha, the full pipeline (preprocessor plus model) is refit on five cross-validation folds, scored by negative mean squared error, and the average validation RMSE is recorded.

The grid search yielded a clear optimum that balances bias and variance: alpha = 10.0, with a cross-validated RMSE of 0.1174 — a marginal improvement over the 0.1176 obtained at alpha = 1.0 in the initial comparison. The small gain tells us the model was already near its best; the useful information is in the shape of the curve. Too little regularization (alpha near 0.01) drifts back toward the unstable unregularized fit and inflates variance, while too much (alpha = 100) over-shrinks genuinely informative coefficients and inflates bias; alpha = 10 sits at the sweet spot where the collinear one-hot features are tamed without washing out the signal. By systematically searching the hyperparameter space, we ensured the final model is as generalizable as this algorithm allows on our data, and the exercise underscores the value of hyperparameter optimization even when the payoff is a refinement rather than a breakthrough.
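The same conclusion can be reached with a compact validation curve over the alpha grid used in the search (a sketch on synthetic data; in the notebook the full preprocessing pipeline would replace the plain `Ridge` estimator):

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=300, n_features=40, n_informative=10,
                       noise=10.0, random_state=0)

alphas = [0.01, 0.1, 1.0, 10.0, 100.0]      # the notebook's Ridge grid
cv_rmse = {a: float(np.sqrt(-cross_val_score(
               Ridge(alpha=a), X, y, cv=5,
               scoring='neg_mean_squared_error').mean()))
           for a in alphas}
best_alpha = min(cv_rmse, key=cv_rmse.get)  # alpha with lowest CV RMSE
```

Plotting `cv_rmse` against `alphas` on a log x-axis gives the familiar U-shaped validation curve whose minimum the grid search locates.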

In [ ]:
# %% Cell 14 – Test metrics & feature importance  #############################
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

if target == 'LogSalePrice':
    y_test_original = np.expm1(y_test)
    y_pred_original = np.expm1(y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
    mae  = mean_absolute_error(y_test_original, y_pred_original)
    r2   = r2_score(y_test_original, y_pred_original)
else:
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)

print("\nTest Set Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE : {mae:.2f}")
print(f"R²  : {r2:.4f}")

print("\n==== FEATURE IMPORTANCE ANALYSIS ====")
fitted_preprocessor = best_model.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()
model = best_model.named_steps['model']
if hasattr(model, 'feature_importances_'):
    importances = model.feature_importances_
elif hasattr(model, 'coef_'):
    importances = model.coef_
else:
    importances = None

if importances is not None:
    feature_importance = pd.Series(importances, index=feature_names)
    top_features = feature_importance.abs().sort_values(ascending=False).head(20)
    print("Top 20 most important features:")
    print(top_features)
    plt.figure(figsize=(12, 8))
    top_features.sort_values().plot(kind='barh')
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()
Test Set Performance:
RMSE: 20672.80
MAE : 14298.36
R²  : 0.9126

==== FEATURE IMPORTANCE ANALYSIS ====
Top 20 most important features:
num__GrLivArea                0.118756
num__TotalBsmtSF              0.102502
num__BsmtUnfSF                0.095646
num__BsmtFinSF1               0.084786
cat__Functional_Typ           0.082183
cat__Neighborhood_Crawfor     0.071718
cat__Exterior1st_BrkFace      0.071036
cat__MSZoning_C (all)         0.063491
num__OverallQual              0.062679
cat__Neighborhood_MeadowV     0.060797
cat__Neighborhood_StoneBr     0.059666
cat__Functional_Maj2          0.058781
cat__Condition2_PosN          0.050348
num__HasPool                  0.047365
cat__SaleCondition_Abnorml    0.045688
num__OverallCond              0.044676
cat__Neighborhood_Edwards     0.044331
cat__Heating_Grav             0.044238
num__PoolArea                 0.043060
cat__SaleCondition_Alloca     0.039220
dtype: float64
[Figure: horizontal bar chart of the top 20 feature importances for the best model]

With the model finalized, Cell 14 evaluates it on the hold-out test set and examines which features drive its predictions. Because the target was LogSalePrice, the code first back-transforms both predictions and actuals with expm1, so the test metrics are in dollars: RMSE ≈ $20,673, MAE ≈ $14,298, and R² ≈ 0.913. In other words, the tuned Ridge model explains about 91% of the variance in sale prices for homes it has never seen — a strong figure for real estate data — and its typical absolute error of roughly $14,300 is modest against sale prices that mostly run well into six figures. The cross-validated RMSE of 0.117 on the log scale tells the same story in proportional terms: since exp(0.117) ≈ 1.12, the model's typical prediction lands within roughly ±12% of the true price, so a house actually worth $200,000 would usually be predicted somewhere between about $178,000 and $225,000. The close agreement between cross-validation and test performance indicates that we successfully avoided overfitting, and this level of accuracy suggests the model would be a genuinely useful pricing tool in a real-world setting.
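The back-transform arithmetic behind the "±12%" reading is worth making explicit — since the model predicts log1p(SalePrice), a fixed error on the log scale is a multiplicative error in dollars:

```python
import numpy as np

true_price = 200_000.0
log_price = np.log1p(true_price)     # the target the model actually fits

# A residual of +/-0.117 on the log scale (the CV RMSE) maps back to dollars:
pred_high = np.expm1(log_price + 0.117)
pred_low = np.expm1(log_price - 0.117)
ratio_high = pred_high / true_price  # roughly 1.12, i.e. about +12%
```

This is why a single log-scale RMSE summarizes accuracy across cheap and expensive houses alike: the error band scales with the price.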

Beyond overall accuracy, we gain insight from the feature importance analysis — for a Ridge model this uses the absolute values of the learned coefficients on the preprocessed (scaled and one-hot encoded) features, so the magnitudes are roughly comparable across predictors. The rankings show that size is the dominant driver: GrLivArea (above-ground living area) tops the list, followed closely by the basement measures TotalBsmtSF, BsmtUnfSF, and BsmtFinSF1, quantifying the intuition that every additional square foot — above ground or below — adds value. Quality matters too: OverallQual and OverallCond both appear in the top 20, reinforcing that a one-unit improvement in the quality rating (say from "Good" to "Very Good") shifts the expected price noticeably, all else equal. The model also leans on functionality and sale-circumstance indicators — Functional_Typ (typical home functionality), SaleCondition_Abnorml, and MSZoning_C (all) — which capture premiums and discounts attached to how and where a property transacts rather than to its physical fabric.

Interestingly, the one-hot encoded Neighborhood features also carry substantial weight: Crawfor, MeadowV, StoneBr, and Edwards all rank among the top 20 predictors. This means that even after accounting for physical attributes, location still adds significant information — the model has learned that certain neighborhoods consistently command higher or lower prices than otherwise similar homes elsewhere, consistent with the neighborhood price analysis in Cell 15. Some smaller entries are informative curiosities: both HasPool and PoolArea appear, a reminder that correlated engineered features can split a single effect across coefficients, and niche categories like Heating_Grav (gravity heating, typical of older homes) and Condition2_PosN flag small but distinctive market segments. Conversely, features one might expect to matter — fireplaces, fencing — do not crack the top 20, suggesting their effect is either small or already absorbed by correlated size and quality measures. Overall, the importance analysis corroborates that the model focuses on sensible, real-world drivers of price — space, quality, location, and sale circumstances — rather than noise, which adds interpretability and increases our confidence that the model aligns with domain knowledge.
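Because each neighborhood level is its own dummy column, a fairer whole-variable comparison sums the absolute coefficients by parent feature. A sketch using a few of the values printed above (the `parent` helper and its prefix-splitting convention are assumptions about the sklearn `num__`/`cat__` naming):

```python
import pandas as pd

# A few |coefficient| values copied from the top-20 list above.
importances = pd.Series({
    'num__GrLivArea': 0.118756,
    'num__OverallQual': 0.062679,
    'cat__Neighborhood_Crawfor': 0.071718,
    'cat__Neighborhood_MeadowV': 0.060797,
    'cat__Neighborhood_StoneBr': 0.059666,
})

def parent(name):
    """Map 'cat__Neighborhood_Crawfor' -> 'Neighborhood', etc."""
    stem = name.split('__', 1)[1]   # drop the num__/cat__ transformer prefix
    return stem.split('_', 1)[0]    # collapse one-hot levels to the column name

grouped = importances.abs().groupby(parent).sum().sort_values(ascending=False)
# Summed this way, Neighborhood outweighs any single numeric feature shown.
```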

In [ ]:
# %% Cell 15 – Neighborhood analysis  #########################################
print("\n==== NEIGHBORHOOD EFFECTS ON HOUSE PRICES ====")
if 'SalePrice' in df.columns and 'Neighborhood' in df.columns:
    neighborhood_stats = df.groupby('Neighborhood')['SalePrice'].agg(['mean', 'median', 'std', 'count'])
    neighborhood_stats = neighborhood_stats.sort_values('median', ascending=False)
    print("Neighborhood Price Statistics:")
    print(neighborhood_stats)

    plt.figure(figsize=(14, 8))
    ax = sns.barplot(x=neighborhood_stats.index, y=neighborhood_stats['median'], order=neighborhood_stats.index)
    plt.title('Median House Prices by Neighborhood')
    plt.xlabel('Neighborhood'); plt.ylabel('Median Price ($)')
    plt.xticks(rotation=45, ha='right')
    for p in ax.patches:
        ax.annotate(f'${int(p.get_height()):,}', (p.get_x()+p.get_width()/2., p.get_height()),
                    ha='center', va='bottom', xytext=(0,5), textcoords='offset points')
    plt.tight_layout(); plt.show()

    plt.figure(figsize=(14, 8))
    sns.boxplot(x='Neighborhood', y='SalePrice', data=df, order=neighborhood_stats.index)
    plt.title('Distribution of House Prices within Neighborhoods')
    plt.xlabel('Neighborhood'); plt.ylabel('Price ($)')
    plt.xticks(rotation=45, ha='right'); plt.tight_layout(); plt.show()

    if 'TotalSF' in df.columns:
        df['PricePerSqFt'] = df['SalePrice'] / df['TotalSF']
        price_per_sqft = df.groupby('Neighborhood')['PricePerSqFt'].median().sort_values(ascending=False)
        plt.figure(figsize=(14, 8))
        ax = sns.barplot(x=price_per_sqft.index, y=price_per_sqft.values, order=price_per_sqft.index)
        plt.title('Median Price per Square Foot by Neighborhood')
        plt.xlabel('Neighborhood'); plt.ylabel('Price per Square Foot ($)')
        plt.xticks(rotation=45, ha='right')
        for p in ax.patches:
            ax.annotate(f'${p.get_height():.0f}', (p.get_x()+p.get_width()/2., p.get_height()),
                        ha='center', va='bottom', xytext=(0,5), textcoords='offset points')
        plt.tight_layout(); plt.show()

    if 'OverallQual' in df.columns:
        top_neighborhoods = neighborhood_stats.index[:6]
        plt.figure(figsize=(12, 8))
        for n in top_neighborhoods:
            sns.regplot(x='OverallQual', y='SalePrice', data=df[df['Neighborhood']==n],
                        scatter=True, label=n, scatter_kws={'alpha':0.5})
        plt.title('Quality Premium by Neighborhood')
        plt.xlabel('Overall Quality (1-10 scale)'); plt.ylabel('Sale Price ($)')
        plt.legend(); plt.tight_layout(); plt.show()

        quality_premium = {}
        for n in top_neighborhoods:
            n_data = df[df['Neighborhood']==n]
            if len(n_data)>5:
                Xq = n_data[['OverallQual']]; yq = n_data['SalePrice']
                reg = LinearRegression().fit(Xq, yq)
                quality_premium[n] = reg.coef_[0]
        premium_df = pd.DataFrame({'Neighborhood':quality_premium.keys(),
                                   'QualityPremium':quality_premium.values()}).sort_values('QualityPremium', ascending=False)
        print("\nQuality Premium by Neighborhood:")
        for _,row in premium_df.iterrows():
            print(f"{row['Neighborhood']}: ${row['QualityPremium']:,.0f}")
==== NEIGHBORHOOD EFFECTS ON HOUSE PRICES ====
Neighborhood Price Statistics:
                       mean    median           std  count
Neighborhood                                              
NridgHt       288322.577922  315000.0  56679.812109     77
NoRidge       297854.158537  301500.0  39770.107566     41
StoneBr       275573.500000  278000.0  62455.205648     25
Timber        239529.355263  228475.0  59617.753373     38
Somerst       224056.546512  225500.0  52487.727695     86
Veenker       234685.227273  218000.0  64079.276296     11
Crawfor       208419.088235  200624.0  63713.837819     51
ClearCr       212565.428571  200250.0  50231.538993     28
CollgCr       197107.340000  197200.0  48319.637373    150
Blmngtn       194870.882353  191000.0  30393.229219     17
NWAmes        189050.068493  182900.0  37172.218106     73
Gilbert       192380.297468  181000.0  33696.396220     79
SawyerW       186555.796610  179900.0  55651.997820     59
Mitchel       156270.122449  153500.0  36486.625334     49
NPkVill       142694.444444  146000.0   9377.314529      9
NAmes         145825.024444  140000.0  32943.343070    225
SWISU         142591.360000  139500.0  32622.917679     25
Blueste       137500.000000  137500.0  19091.883092      2
Sawyer        136793.135135  135000.0  22345.129157     74
BrkSide       124834.051724  124300.0  40348.689270     58
Edwards       128219.700000  121750.0  43208.616459    100
OldTown       127030.942478  119000.0  45798.853701    113
BrDale        104493.750000  106000.0  14330.176493     16
IDOTRR        100123.783784  103000.0  33376.710117     37
MeadowV        98576.470588   88000.0  23491.049610     17
[Figure: median house prices by neighborhood (bar chart)]
[Figure: distribution of house prices within neighborhoods (box plots)]
[Figure: median price per square foot by neighborhood (bar chart)]
[Figure: OverallQual vs. SalePrice regressions for the top six neighborhoods]
Quality Premium by Neighborhood:
StoneBr: $48,982
NridgHt: $45,726
Somerst: $40,640
Veenker: $38,700
Timber: $35,316
NoRidge: $25,992

In this cell, we shine a spotlight on the effect of Neighborhood on housing prices. The code groups sales by neighborhood and computes the mean, median, standard deviation, and count, then visualizes the medians as a bar chart and the full within-neighborhood distributions as box plots. It also derives a PricePerSqFt column to compare locations on a size-adjusted basis, and finally fits a simple per-neighborhood regression of SalePrice on OverallQual to estimate how much an additional quality point is worth in each of the top six neighborhoods. The headline numbers show dramatic disparities: median prices range from $315,000 in NridgHt and $301,500 in NoRidge down to $103,000 in IDOTRR and just $88,000 in MeadowV — roughly a 3.6× spread between the most and least expensive areas of Ames.

The size-adjusted and quality-adjusted views confirm that these gaps are not merely a matter of bigger houses in richer areas. Price per square foot still ranks the upscale neighborhoods at the top, indicating a genuine location premium per unit of space, and the quality-premium regressions show that the market rewards quality differently by location: an additional point of OverallQual is worth about $48,982 in StoneBr and $45,726 in NridgHt, but only $25,992 in NoRidge among the top six neighborhoods. Because Neighborhood enters the model as dummy variables, the Ridge fit can learn these premiums and discounts explicitly — which is exactly why several neighborhood indicators surfaced in the top-20 coefficient list of Cell 14. The takeaway is that "location, location, location" is not just a cliché but a quantifiable reality in this data: two houses with identical characteristics can differ greatly in price simply because one sits in a more desirable neighborhood, and any credible pricing model must account for that spatial context alongside physical attributes like area and quality.
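The headline spread between neighborhoods can be pulled straight from the medians printed above:

```python
import pandas as pd

# Median sale prices for a few neighborhoods, from the table in this cell.
medians = pd.Series({'NridgHt': 315_000, 'StoneBr': 278_000,
                     'NAmes': 140_000, 'OldTown': 119_000,
                     'MeadowV': 88_000})

spread = medians.max() / medians.min()   # top vs bottom neighborhood, ~3.6x
```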

Final Insights and Conclusions¶

This comprehensive analysis of house prices in Ames has produced both a powerful predictive model and a deeper understanding of the factors that influence home values. Through careful exploratory data analysis, we discovered that the raw SalePrice distribution was skewed and benefited from a log transformation, which improved normality and modeling accuracy. We also quantified relationships between features and price: features related to home size and quality (such as total square footage and overall condition) showed the strongest positive correlations with sale price, whereas factors like the age of the home had negative effects. By engineering domain-specific features (e.g., total square footage, house age) and encoding categorical variables (like neighborhood and quality ratings) appropriately, we enriched the dataset with information that enhanced predictive power. The modeling phase culminated in a tuned Ridge regression model that demonstrated excellent performance, explaining about 91% of the variance in sale prices (test R² ≈ 0.913) with a mean absolute error near $14,300 on unseen data. Such a high R² in this context indicates that our model captures the vast majority of the important dynamics in the housing market data. Equally important, the model's behavior aligns with domain intuition: the most influential predictors turned out to be things a real estate expert would expect (living space, construction quality, location, etc.), which means our data-driven approach rediscovered and quantified these fundamental drivers.

The key takeaways from this project are both practical and methodological. Practically, we now have a robust model that can estimate property values with a high degree of accuracy. It identifies that improving a home’s quality (materials, finish, kitchen, exterior) and increasing usable space are likely to yield higher returns on the market, whereas certain limitations (an older home, or being located in a less desirable neighborhood) can depress the expected price. These insights could guide homeowners or investors on where to focus renovations, or help appraisers and agents make more informed pricing decisions by adjusting for factors like neighborhood effects or quality ratings as quantified by the model. From a methodological standpoint, the analysis showcases a strong end-to-end data science pipeline. We began by validating assumptions (using distribution plots and Q–Q plots), applied statistical tests (like the t-test to confirm zoning impacts), and moved through feature engineering into a rigorous model comparison and tuning process. Each step was grounded in both data and domain knowledge – for example, we knew to log-transform prices because homes typically appreciate multiplicatively, and we knew to include neighborhood because “location” often escapes purely physical descriptions of a house. By leveraging cross-validation and grid search, we ensured our final model was not overfit and generalized well, which was confirmed by its test performance. In conclusion, the analysis not only yielded a high-performing predictive model but also reinforced our understanding of the housing market: it demonstrated how much of the variation in prices can be explained by measurable home attributes and location factors. This blend of high predictive accuracy and interpretability means the results are not just statistically sound but also actionable. 
Stakeholders can trust the model’s predictions and use its insights – a testament to the analytical rigor and the strong integration of real-world expertise in our modeling approach.