Skip to content

Commit c1cd726

Browse files
authored
Merge pull request #318 from ChethanaPotukanam/os_analysis
Open Source Analysis #315
2 parents dcdcad6 + 583125a commit c1cd726

File tree

5 files changed

+1478
-0
lines changed

5 files changed

+1478
-0
lines changed

opensource_analysis/README

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Stack Overflow Analysis Project
2+
3+
## Setup Instructions
4+
5+
1. **Download and Extract the Project Folder**
6+
- Download the project folder and extract it to a desired location on your computer.
7+
8+
2. **Navigate to the Project Directory**
9+
```bash
10+
cd /path/to/extracted/project/folder/opensource_analysis
```
11+
12+
13+
## Install the Dependencies
14+
pip install -r requirements.txt
15+
16+
## Run the Streamlit App
17+
streamlit run app.py
18+
19+
## Access the App
20+
Open the URL http://localhost:8501 in your web browser to access the Streamlit app.

opensource_analysis/app.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
import os
2+
import streamlit as st
3+
import pandas as pd
4+
import numpy as np
5+
from sklearn.model_selection import train_test_split
6+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
7+
from sklearn.compose import ColumnTransformer
8+
from sklearn.pipeline import Pipeline
9+
from sklearn.ensemble import RandomForestClassifier
10+
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, auc
11+
import matplotlib.pyplot as plt
12+
import seaborn as sns
13+
14+
# Streamlit app: trains a random forest that predicts the survey's
# 'OpenSource' answer (Yes/No) from a handful of categorical answers, shows
# evaluation plots, and lets the user score a hand-entered respondent.

# Path to the survey extract, resolved relative to the working directory
# Streamlit is launched from.
file_path = 'survey_results_sample_2018.csv'

# Columns read from the CSV; the last one is the prediction target.
columns = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age', 'OpenSource']
target_variable = 'OpenSource'
categorical_features = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age']

# Map the survey's age brackets to ordinal codes. Brackets that are missing
# or not listed here become NaN after .map().
age_mapping = {
    'Under 18 years old': 0,
    '18 - 24 years old': 1,
    '25 - 34 years old': 2,
    '35 - 44 years old': 3,
    '45 - 54 years old': 4,
    '55 - 64 years old': 5,
    '65 years or older': 6,
}

# Streamlit re-executes this script from the top on every widget interaction.
# Without caching, the CSV would be re-read and the forest retrained each time
# a select box changes. Fall back to a no-op decorator on Streamlit versions
# that do not provide st.cache_resource.
_cache_resource = getattr(st, 'cache_resource', lambda func: func)


@_cache_resource
def _load_and_train(path, mtime):
    """Load the survey extract and fit the classification pipeline.

    Args:
        path: Location of the CSV file.
        mtime: Modification time of the file. It is not used in the body; it
            is part of the cache key so that a changed file triggers a
            retrain. (Deliberately not underscore-prefixed: Streamlit skips
            hashing of underscore-prefixed arguments.)

    Returns:
        A ``(data, model, X_test, y_test)`` tuple: the filtered frame used to
        populate the input widgets, the fitted pipeline, and the held-out
        split used for evaluation.
    """
    data = pd.read_csv(path, usecols=columns)[columns].copy()
    data['Age'] = data['Age'].map(age_mapping)

    # The evaluation code below is binary (Yes vs. No). Rows with a missing or
    # unexpected target would break fitting or the ROC computation, so they
    # are dropped before splitting.
    data = data[data[target_variable].isin(['No', 'Yes'])]

    X = data.drop(target_variable, axis=1)
    y = data[target_variable]
    X_train, X_test, y_train, _y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Every feature, including the ordinal Age code, is one-hot encoded;
    # categories unseen at fit time are encoded as all-zeros.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ]
    )
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])
    model.fit(X_train, y_train)
    return data, model, X_test, _y_test


# Check if the file exists; halt this script run if it does not.
if not os.path.exists(file_path):
    st.error(f"File not found: {file_path}. Please ensure the file is in the correct directory.")
    st.stop()

data, model, X_test, y_test = _load_and_train(file_path, os.path.getmtime(file_path))
classifier = model.named_steps['classifier']

# Evaluate the model. Look up the probability column of the positive class
# instead of assuming it is column 1.
positive_index = list(classifier.classes_).index('Yes')
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, positive_index]
y_test_binary = (y_test == 'Yes').astype(int)
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test_binary, y_score)

# Get feature importance, one row per one-hot encoded column.
encoder = model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = list(encoder.get_feature_names_out(categorical_features))
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': classifier.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Streamlit App
st.title('Machine Learning Model Evaluation')

# Show classification report
st.header('Classification Report')
st.text(classification_rep)

# Show ROC-AUC Score
st.header('ROC-AUC Score')
st.text(f"ROC-AUC Score: {roc_auc:.2f}")

# Plot confusion matrix. `labels` pins the row/column order so it always
# matches the hard-coded tick labels.
st.header('Confusion Matrix')
cm = confusion_matrix(y_test, y_pred, labels=['No', 'Yes'])
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
st.pyplot(fig)
plt.close(fig)  # figures would otherwise accumulate across script reruns

# Plot ROC Curve
st.header('ROC Curve')
fpr, tpr, _ = roc_curve(y_test_binary, y_score)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
st.pyplot(fig)
plt.close(fig)

# Plot feature importance
st.header('Feature Importance')
fig, ax = plt.subplots()
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20), palette='viridis', ax=ax)
ax.set_title('Top Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
st.pyplot(fig)
plt.close(fig)

# Section for new data input and prediction
st.header('Predict for New Data')

# Input fields for new data. Missing values are removed from the choices so
# the user can only pick real answers; the previous fillna('') turned a NaN
# choice into an unseen '' category that the encoder silently ignored.
employment = st.selectbox('Employment', data['Employment'].dropna().unique())
education = st.selectbox('Formal Education', data['FormalEducation'].dropna().unique())
company_size = st.selectbox('Company Size', data['CompanySize'].dropna().unique())
dev_type = st.selectbox('Dev Type', data['DevType'].dropna().unique())
exercise = st.selectbox('Exercise', data['Exercise'].dropna().unique())
age = st.selectbox('Age', list(age_mapping.keys()))

# Convert inputs to a one-row dataframe with the training column names.
new_data = pd.DataFrame({
    'Employment': [employment],
    'FormalEducation': [education],
    'CompanySize': [company_size],
    'DevType': [dev_type],
    'Exercise': [exercise],
    'Age': [age_mapping[age]],
})

# Predict the output for new data
if st.button('Predict'):
    try:
        prediction = model.predict(new_data)
        prediction_prob = model.predict_proba(new_data)[:, positive_index]
        st.write(f'Prediction: {"Yes" if prediction[0] == "Yes" else "No"}')
        st.write(f'Prediction Probability: {prediction_prob[0]:.2f}')
    except Exception as e:
        # UI boundary: show the failure in the page instead of a traceback.
        st.error(f"An error occurred during prediction: {e}")

0 commit comments

Comments
 (0)