Project Resource Allocation Example

Example of Dummy Data

Code
import pandas as pd
import numpy as np

# Create dummy data
np.random.seed(42)  # For reproducibility

data = {
    'ProjectID': range(1, 101),
    'Duration': np.random.randint(10, 50, size=100),
    'TeamSize': np.random.randint(5, 20, size=100),
    'Complexity': np.random.randint(1, 10, size=100),
    'InitialBudget': np.random.randint(50, 200, size=100),
    'PriorIssues': np.random.randint(0, 5, size=100),
    'ProjectType': np.random.choice(['Residential', 'Commercial', 'Industrial'], size=100),
    'TeamExperience': np.random.randint(1, 15, size=100),
    'ExternalFactors': np.random.uniform(0.5, 1.5, size=100),
    'ResourceAvailability': np.random.randint(1, 10, size=100),
    'ProjectPhase': np.random.choice(['Design', 'Foundation', 'Structure', 'Finishing'], size=100),
    'ActualCost': np.random.randint(60, 250, size=100)
}

df = pd.DataFrame(data)

# First few rows of the DataFrame
df.head()
   ProjectID  Duration  TeamSize  Complexity  InitialBudget  PriorIssues ProjectType  TeamExperience  ExternalFactors  ResourceAvailability ProjectPhase  ActualCost
0          1        48        11           8            117            4  Commercial               5         0.577735                     2   Foundation         121
1          2        38        11           9             82            3  Commercial               6         1.474395                     3       Design         191
2          3        24        18           4            191            4  Commercial               3         1.486211                     3   Foundation         148
3          4        17        12           1             70            3  Commercial               8         1.198162                     5    Structure         101
4          5        30         9           1             97            2  Commercial              13         1.036096                     5    Structure         148
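
Before modeling, it helps to sanity-check the generated frame. A minimal sketch using standard pandas calls (nothing here beyond what pandas itself provides):

Code
# Quick sanity checks on the dummy data
df.info()                         # column dtypes and non-null counts
df['ProjectType'].value_counts()  # class balance of a categorical column
df.describe().round(2)            # summary statistics for the numeric columns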

Data Preparation

Code
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Convert categorical features to numerical values
label_encoders = {}
categorical_features = ['ProjectType', 'ProjectPhase']
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

# Split the data into features (X) and target (y)
X = df.drop(columns=['ProjectID', 'ActualCost'])
y = df['ActualCost']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
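
Note that LabelEncoder is designed for encoding targets; applied to features it imposes an arbitrary integer ordering on ProjectType and ProjectPhase. Tree-based models usually tolerate this, but one-hot encoding is the safer default, especially for linear models. A minimal alternative sketch using pandas.get_dummies (it assumes df still holds the raw string categories from the data-generation step, i.e. it would run before the label-encoding loop above):

Code
# Alternative: one-hot encode the categorical columns instead of label-encoding them.
# Assumes df has not yet been label-encoded in place.
X_onehot = pd.get_dummies(
    df.drop(columns=['ProjectID', 'ActualCost']),
    columns=['ProjectType', 'ProjectPhase'],
    drop_first=True  # drop one dummy per column to avoid redundancy
)
X_train_oh, X_test_oh, y_train_oh, y_test_oh = train_test_split(
    X_onehot, y, test_size=0.3, random_state=42
)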

Model Training and Evaluation

Code
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output the results
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
Mean Squared Error: 3473.57541
R^2 Score: -0.1123097958274566
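
A negative R^2 means the model predicts worse than simply using the mean of the test target, which is expected here: every feature and ActualCost were drawn independently at random, so there is no signal to learn. One way to make that baseline explicit is scikit-learn's DummyRegressor; a minimal sketch:

Code
from sklearn.dummy import DummyRegressor

# Baseline that always predicts the training-set mean of the target
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
y_pred_base = baseline.predict(X_test)

print("Baseline MSE:", mean_squared_error(y_test, y_pred_base))
print("Baseline R^2:", r2_score(y_test, y_pred_base))  # close to 0 by construction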

Hyperparameter Tuning

Code
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with 3-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Best parameters and model performance
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

#print("Best Parameters from GridSearchCV:")
#print(best_params)

# Predict on the test set with the best model
y_pred_best = best_rf.predict(X_test)

# Evaluate the best model
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

# Output the results
print("Mean Squared Error (Best Model):", mse_best)
print("R^2 Score (Best Model):", r2_best)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Mean Squared Error (Best Model): 3345.293177772035
R^2 Score (Best Model): -0.07123120483818601
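
The tuned R^2 is still negative, as expected: no hyperparameter setting can recover signal that the random data does not contain. Note also that the grid above spans 3 x 4 x 3 x 3 = 108 combinations, hence the 324 fits at 3 folds. When the grid grows further, RandomizedSearchCV samples a fixed number of combinations instead of trying them all; a minimal sketch with an assumed budget of 20 sampled combinations:

Code
from sklearn.model_selection import RandomizedSearchCV

# Sample 20 random combinations from the same grid instead of all 108
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=20,  # assumed budget; adjust to taste
    cv=3,
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)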

Feature Importance

Code
import matplotlib.pyplot as plt
import seaborn as sns

# Extract feature importances from the best model
feature_importances = best_rf.feature_importances_

# Create a DataFrame for better visualization
features_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by importance
features_df = features_df.sort_values(by='Importance', ascending=False)

# Display the feature importances
print("Feature Importances:")
print(features_df)

# Plotting the feature importances
plt.figure(figsize=(10, 6))
# hue='Feature' with legend=False applies the palette per bar without
# seaborn's deprecation warning for palette-without-hue
sns.barplot(x='Importance', y='Feature', hue='Feature', data=features_df,
            palette='viridis', legend=False)
plt.title('Feature Importances in Random Forest Model')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
Feature Importances:
                Feature  Importance
8  ResourceAvailability    0.184655
3         InitialBudget    0.172776
1              TeamSize    0.156746
0              Duration    0.111923
7       ExternalFactors    0.105970
6        TeamExperience    0.092655
4           PriorIssues    0.066102
2            Complexity    0.062543
5           ProjectType    0.029834
9          ProjectPhase    0.016797
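
Impurity-based importances like these can overstate features with many distinct values and are computed on the training data. Permutation importance on the held-out test set is a common cross-check; a minimal sketch using sklearn.inspection.permutation_importance:

Code
from sklearn.inspection import permutation_importance

# Shuffle each feature on the test set and measure the drop in score
perm = permutation_importance(
    best_rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)
perm_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': perm.importances_mean
}).sort_values(by='Importance', ascending=False)

print("Permutation Importances:")
print(perm_df)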