import pandas as pd
import numpy as np

# Create dummy data
np.random.seed(42)  # For reproducibility
data = {
    'ProjectID': range(1, 101),
    'Duration': np.random.randint(10, 50, size=100),
    'TeamSize': np.random.randint(5, 20, size=100),
    'Complexity': np.random.randint(1, 10, size=100),
    'InitialBudget': np.random.randint(50, 200, size=100),
    'PriorIssues': np.random.randint(0, 5, size=100),
    'ProjectType': np.random.choice(['Residential', 'Commercial', 'Industrial'], size=100),
    'TeamExperience': np.random.randint(1, 15, size=100),
    'ExternalFactors': np.random.uniform(0.5, 1.5, size=100),
    'ResourceAvailability': np.random.randint(1, 10, size=100),
    'ProjectPhase': np.random.choice(['Design', 'Foundation', 'Structure', 'Finishing'], size=100),
    'ActualCost': np.random.randint(60, 250, size=100)
}
df = pd.DataFrame(data)

# First few rows of the dataframe
df.head()
   ProjectID  Duration  TeamSize  Complexity  InitialBudget  PriorIssues ProjectType  TeamExperience  ExternalFactors  ResourceAvailability ProjectPhase  ActualCost
0          1        48        11           8            117            4  Commercial               5         0.577735                     2   Foundation         121
1          2        38        11           9             82            3  Commercial               6         1.474395                     3       Design         191
2          3        24        18           4            191            4  Commercial               3         1.486211                     3   Foundation         148
3          4        17        12           1             70            3  Commercial               8         1.198162                     5    Structure         101
4          5        30         9           1             97            2  Commercial              13         1.036096                     5    Structure         148
Data Prep
Code
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Convert categorical features to numerical values
label_encoders = {}
categorical_features = ['ProjectType', 'ProjectPhase']
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

# Split the data into features (X) and target (y)
X = df.drop(columns=['ProjectID', 'ActualCost'])
y = df['ActualCost']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
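One caveat: LabelEncoder assigns each category an arbitrary integer, which implies an ordering that ProjectType and ProjectPhase do not actually have. Tree-based models usually tolerate this, but one-hot encoding is the safer default for nominal categories. A minimal alternative sketch, reusing the df and imports from above:

# One-hot encode the nominal columns instead of integer-encoding them;
# pd.get_dummies adds one 0/1 indicator column per category level.
df_encoded = pd.get_dummies(df, columns=['ProjectType', 'ProjectPhase'], drop_first=True)

X = df_encoded.drop(columns=['ProjectID', 'ActualCost'])
y = df_encoded['ActualCost']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)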
Model Training
Code
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output the results
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
Mean Squared Error: 3473.57541
R^2 Score: -0.1123097958274566
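A negative R² means the model predicts worse than a constant that always outputs the mean of the targets. That is expected here: every column in the dummy data was sampled independently, so ActualCost has no real relationship to the features and there is no signal to learn. As a sanity check, a mean-predicting baseline should score near zero; a short sketch using scikit-learn's DummyRegressor on the same split:

from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Baseline that ignores the features and always predicts the training mean
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
y_base = baseline.predict(X_test)

print("Baseline MSE:", mean_squared_error(y_test, y_base))
print("Baseline R^2:", r2_score(y_test, y_base))  # close to 0 by construction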
Hyperparameter Tuning
Code
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with 3-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=0)

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Best parameters and model performance
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
# print("Best Parameters from GridSearchCV:")
# print(best_params)

# Predict on the test set with the best model
y_pred_best = best_rf.predict(X_test)

# Evaluate the best model
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

# Output the results
print("Mean Squared Error (Best Model):", mse_best)
print("R^2 Score (Best Model):", r2_best)
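This grid contains 3 × 4 × 3 × 3 = 108 parameter combinations, so 3-fold cross-validation fits 324 models. If the search gets expensive, RandomizedSearchCV samples a fixed number of combinations instead of trying them all; a minimal sketch reusing the rf and param_grid defined above:

from sklearn.model_selection import RandomizedSearchCV

# Evaluate only 20 randomly sampled combinations rather than all 108
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
print("Best parameters (randomized):", random_search.best_params_)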
Feature Importance
Code
import matplotlib.pyplot as plt
import seaborn as sns

# Extract feature importances from the best model
feature_importances = best_rf.feature_importances_

# Create a DataFrame for better visualization
features_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by importance
features_df = features_df.sort_values(by='Importance', ascending=False)

# Display the feature importances
print("Feature Importances:")
print(features_df)

# Plotting the feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=features_df, palette='viridis')
plt.title('Feature Importances in Random Forest Model')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
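Impurity-based importances from feature_importances_ are computed on the training data and can favor high-cardinality features. A useful cross-check is permutation importance measured on the held-out test set; a sketch using scikit-learn's inspection module with the best_rf and split from above:

from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and record the average drop in score
perm = permutation_importance(best_rf, X_test, y_test, n_repeats=10, random_state=42)

perm_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': perm.importances_mean
}).sort_values(by='Importance', ascending=False)
print(perm_df)

On this synthetic data, both measures should hover near zero, consistent with the negative R² above.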