Skip to content

Commit ed0dc7f

Browse files
authored
Merge pull request #381 from Ankush0286/main
Improved Error Handling
2 parents f7588ee + 8c8524f commit ed0dc7f

File tree

1 file changed

+135
-131
lines changed

1 file changed

+135
-131
lines changed

opensource_analysis/app.py

Lines changed: 135 additions & 131 deletions
Original file line number | Diff line number | Diff line change
@@ -18,134 +18,138 @@
1818
if not os.path.exists(file_path):
1919
st.error(f"File not found: {file_path}. Please ensure the file is in the correct directory.")
2020
else:
21-
# Load the dataset
22-
data = pd.read_csv(file_path)
23-
24-
# Define the necessary columns
25-
columns = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age', 'OpenSource']
26-
data = data[columns].copy()
27-
28-
# Map age values to numerical values
29-
age_mapping = {
30-
'Under 18 years old': 0,
31-
'18 - 24 years old': 1,
32-
'25 - 34 years old': 2,
33-
'35 - 44 years old': 3,
34-
'45 - 54 years old': 4,
35-
'55 - 64 years old': 5,
36-
'65 years or older': 6
37-
}
38-
data['Age'] = data['Age'].map(age_mapping)
39-
40-
# Define target variable and feature columns
41-
target_variable = 'OpenSource'
42-
categorical_features = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age']
43-
numerical_features = []
44-
45-
# Preprocessing for categorical data
46-
preprocessor = ColumnTransformer(
47-
transformers=[
48-
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
49-
]
50-
)
51-
52-
# Split the data
53-
X = data.drop(target_variable, axis=1)
54-
y = data[target_variable]
55-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
56-
57-
# Create and train the model
58-
model = Pipeline(steps=[
59-
('preprocessor', preprocessor),
60-
('classifier', RandomForestClassifier(random_state=42))
61-
])
62-
model.fit(X_train, y_train)
63-
64-
# Evaluate the model
65-
y_pred = model.predict(X_test)
66-
classification_rep = classification_report(y_test, y_pred)
67-
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
68-
69-
# Get feature importance
70-
importances = model.named_steps['classifier'].feature_importances_
71-
feature_names = list(model.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out())
72-
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
73-
74-
# Streamlit App
75-
st.title('Machine Learning Model Evaluation')
76-
77-
# Show classification report
78-
st.header('Classification Report')
79-
st.text(classification_rep)
80-
81-
# Show ROC-AUC Score
82-
st.header('ROC-AUC Score')
83-
st.text(f"ROC-AUC Score: {roc_auc:.2f}")
84-
85-
# Plot confusion matrix
86-
st.header('Confusion Matrix')
87-
cm = confusion_matrix(y_test, y_pred)
88-
fig, ax = plt.subplots()
89-
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], ax=ax)
90-
plt.xlabel('Predicted')
91-
plt.ylabel('Actual')
92-
st.pyplot(fig)
93-
94-
# Plot ROC Curve
95-
st.header('ROC Curve')
96-
y_test_binary = y_test.map({'No': 0, 'Yes': 1})
97-
fpr, tpr, _ = roc_curve(y_test_binary, model.predict_proba(X_test)[:, 1])
98-
roc_auc = auc(fpr, tpr)
99-
fig, ax = plt.subplots()
100-
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
101-
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
102-
ax.set_xlim([0.0, 1.0])
103-
ax.set_ylim([0.0, 1.05])
104-
ax.set_xlabel('False Positive Rate')
105-
ax.set_ylabel('True Positive Rate')
106-
ax.set_title('ROC Curve')
107-
ax.legend(loc='lower right')
108-
st.pyplot(fig)
109-
110-
# Plot feature importance
111-
st.header('Feature Importance')
112-
fig, ax = plt.subplots()
113-
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20), palette='viridis', ax=ax)
114-
ax.set_title('Top Feature Importances')
115-
ax.set_xlabel('Importance')
116-
ax.set_ylabel('Feature')
117-
st.pyplot(fig)
118-
119-
# Section for new data input and prediction
120-
st.header('Predict for New Data')
121-
122-
# Input fields for new data
123-
employment = st.selectbox('Employment', data['Employment'].unique())
124-
education = st.selectbox('Formal Education', data['FormalEducation'].unique())
125-
company_size = st.selectbox('Company Size', data['CompanySize'].unique())
126-
dev_type = st.selectbox('Dev Type', data['DevType'].unique())
127-
exercise = st.selectbox('Exercise', data['Exercise'].unique())
128-
age = st.selectbox('Age', list(age_mapping.keys()))
129-
130-
# Convert inputs to dataframe
131-
new_data = pd.DataFrame({
132-
'Employment': [employment],
133-
'FormalEducation': [education],
134-
'CompanySize': [company_size],
135-
'DevType': [dev_type],
136-
'Exercise': [exercise],
137-
'Age': [age_mapping[age]]
138-
})
139-
140-
# Handle any NaN values
141-
new_data = new_data.fillna('')
142-
143-
# Predict the output for new data
144-
if st.button('Predict'):
145-
try:
146-
prediction = model.predict(new_data)
147-
prediction_prob = model.predict_proba(new_data)[:, 1]
148-
st.write(f'Prediction: {"Yes" if prediction[0] == "Yes" else "No"}')
149-
st.write(f'Prediction Probability: {prediction_prob[0]:.2f}')
150-
except Exception as e:
151-
st.error(f"An error occurred during prediction: {e}")
21+
try:
22+
# Try to load the dataset
23+
data = pd.read_csv(file_path)
24+
25+
# Define the necessary columns
26+
columns = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age', 'OpenSource']
27+
data = data[columns].copy()
28+
29+
# Map age values to numerical values
30+
age_mapping = {
31+
'Under 18 years old': 0,
32+
'18 - 24 years old': 1,
33+
'25 - 34 years old': 2,
34+
'35 - 44 years old': 3,
35+
'45 - 54 years old': 4,
36+
'55 - 64 years old': 5,
37+
'65 years or older': 6
38+
}
39+
data['Age'] = data['Age'].map(age_mapping)
40+
41+
# Define target variable and feature columns
42+
target_variable = 'OpenSource'
43+
categorical_features = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age']
44+
numerical_features = []
45+
46+
# Preprocessing for categorical data
47+
preprocessor = ColumnTransformer(
48+
transformers=[
49+
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
50+
]
51+
)
52+
53+
# Split the data
54+
X = data.drop(target_variable, axis=1)
55+
y = data[target_variable]
56+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
57+
58+
# Create and train the model
59+
model = Pipeline(steps=[
60+
('preprocessor', preprocessor),
61+
('classifier', RandomForestClassifier(random_state=42))
62+
])
63+
model.fit(X_train, y_train)
64+
65+
# Evaluate the model
66+
y_pred = model.predict(X_test)
67+
classification_rep = classification_report(y_test, y_pred)
68+
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
69+
70+
# Get feature importance
71+
importances = model.named_steps['classifier'].feature_importances_
72+
feature_names = list(model.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out())
73+
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
74+
75+
# Streamlit App
76+
st.title('Machine Learning Model Evaluation')
77+
78+
# Show classification report
79+
st.header('Classification Report')
80+
st.text(classification_rep)
81+
82+
# Show ROC-AUC Score
83+
st.header('ROC-AUC Score')
84+
st.text(f"ROC-AUC Score: {roc_auc:.2f}")
85+
86+
# Plot confusion matrix
87+
st.header('Confusion Matrix')
88+
cm = confusion_matrix(y_test, y_pred)
89+
fig, ax = plt.subplots()
90+
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], ax=ax)
91+
plt.xlabel('Predicted')
92+
plt.ylabel('Actual')
93+
st.pyplot(fig)
94+
95+
# Plot ROC Curve
96+
st.header('ROC Curve')
97+
y_test_binary = y_test.map({'No': 0, 'Yes': 1})
98+
fpr, tpr, _ = roc_curve(y_test_binary, model.predict_proba(X_test)[:, 1])
99+
roc_auc = auc(fpr, tpr)
100+
fig, ax = plt.subplots()
101+
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
102+
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
103+
ax.set_xlim([0.0, 1.0])
104+
ax.set_ylim([0.0, 1.05])
105+
ax.set_xlabel('False Positive Rate')
106+
ax.set_ylabel('True Positive Rate')
107+
ax.set_title('ROC Curve')
108+
ax.legend(loc='lower right')
109+
st.pyplot(fig)
110+
111+
# Plot feature importance
112+
st.header('Feature Importance')
113+
fig, ax = plt.subplots()
114+
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20), palette='viridis', ax=ax)
115+
ax.set_title('Top Feature Importances')
116+
ax.set_xlabel('Importance')
117+
ax.set_ylabel('Feature')
118+
st.pyplot(fig)
119+
120+
# Section for new data input and prediction
121+
st.header('Predict for New Data')
122+
123+
# Input fields for new data
124+
employment = st.selectbox('Employment', data['Employment'].unique())
125+
education = st.selectbox('Formal Education', data['FormalEducation'].unique())
126+
company_size = st.selectbox('Company Size', data['CompanySize'].unique())
127+
dev_type = st.selectbox('Dev Type', data['DevType'].unique())
128+
exercise = st.selectbox('Exercise', data['Exercise'].unique())
129+
age = st.selectbox('Age', list(age_mapping.keys()))
130+
131+
# Convert inputs to dataframe
132+
new_data = pd.DataFrame({
133+
'Employment': [employment],
134+
'FormalEducation': [education],
135+
'CompanySize': [company_size],
136+
'DevType': [dev_type],
137+
'Exercise': [exercise],
138+
'Age': [age_mapping[age]]
139+
})
140+
141+
# Handle any NaN values
142+
new_data = new_data.fillna('')
143+
144+
# Predict the output for new data
145+
if st.button('Predict'):
146+
try:
147+
prediction = model.predict(new_data)
148+
prediction_prob = model.predict_proba(new_data)[:, 1]
149+
st.write(f'Prediction: {"Yes" if prediction[0] == "Yes" else "No"}')
150+
st.write(f'Prediction Probability: {prediction_prob[0]:.2f}')
151+
except Exception as e:
152+
st.error(f"An error occurred during prediction: {e}")
153+
154+
except Exception as e:
155+
st.error(f"An error occurred while loading data: {e}")

0 commit comments

Comments
 (0)