Examine the impact on participation rates due to different ethnicities

anushkasaxena07 · anushkasaxena07 · commit ac0b8e46bea3 · 2024-06-30T16:26:29.000+05:30
diff --git a/.ipynb_checkpoints/Stackoverflow_Survey_Analysis-checkpoint.ipynb b/.ipynb_checkpoints/Stackoverflow_Survey_Analysis-checkpoint.ipynb
@@ -21729,33 +21729,97 @@
     "plt.show()\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Examine how participation rates differ across ethnicities."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Load the CSV files\n",
+    "file1 = pd.read_csv(r\"Data/survey_results_public_2018.csv\")\n",
+    "file2 = pd.read_csv(r\"Data/survey_results_public_2019.csv\")\n",
+    "file3 = pd.read_csv(r\"Data/survey_results_public_2020.csv\")\n",
+    "\n",
+    "# Merge the data\n",
+    "merged_data = pd.concat([file1, file2, file3], ignore_index=True)\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "def preprocess_data(df):\n",
+    "    # Convert compensation to numeric; non-numeric values become NaN\n",
+    "    df['ConvertedComp'] = pd.to_numeric(df['ConvertedComp'], errors='coerce')\n",
+    "    \n",
+    "    # Fill missing values in relevant columns\n",
+    "    df['EdLevel'] = df['EdLevel'].fillna('Unknown')\n",
+    "    df['YearsCodePro'] = pd.to_numeric(df['YearsCodePro'], errors='coerce').fillna(0)\n",
+    "    df['Gender'] = df['Gender'].fillna('Unknown')\n",
+    "    df['Ethnicity'] = df['Ethnicity'].fillna('Unknown')\n",
+    "    \n",
+    "    # Drop rows where ConvertedComp is NaN\n",
+    "    df = df.dropna(subset=['ConvertedComp'])\n",
+    "    \n",
+    "    return df\n",
+    "\n",
+    "# Preprocess the data\n",
+    "cleaned_data = preprocess_data(merged_data)\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Count the number of respondents by ethnicity\n",
+    "ethnicity_counts = cleaned_data['Ethnicity'].value_counts()\n",
+    "\n",
+    "# Calculate average salary by ethnicity\n",
+    "avg_salary_by_ethnicity = cleaned_data.groupby('Ethnicity')['ConvertedComp'].mean()\n",
+    "\n",
+    "print(\"Number of Respondents by Ethnicity:\")\n",
+    "print(ethnicity_counts)\n",
+    "print(\"\\nAverage Salary by Ethnicity:\")\n",
+    "print(avg_salary_by_ethnicity)\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Number of Respondents by Ethnicity\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "ethnicity_counts.plot(kind='bar')\n",
+    "plt.title('Number of Respondents by Ethnicity')\n",
+    "plt.xlabel('Ethnicity')\n",
+    "plt.ylabel('Number of Respondents')\n",
+    "plt.show()\n",
+    "\n",
+    "# Average Salary by Ethnicity\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "avg_salary_by_ethnicity.plot(kind='bar')\n",
+    "plt.title('Average Salary by Ethnicity')\n",
+    "plt.xlabel('Ethnicity')\n",
+    "plt.ylabel('Average Salary (ConvertedComp)')\n",
+    "plt.show()\n"
+   ]
   },
   {
    "cell_type": "markdown",
diff --git a/Stackoverflow_Survey_Analysis.ipynb b/Stackoverflow_Survey_Analysis.ipynb
@@ -21729,33 +21729,97 @@
     "plt.show()\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Examine how participation rates differ across ethnicities."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Load the CSV files\n",
+    "file1 = pd.read_csv(r\"Data/survey_results_public_2018.csv\")\n",
+    "file2 = pd.read_csv(r\"Data/survey_results_public_2019.csv\")\n",
+    "file3 = pd.read_csv(r\"Data/survey_results_public_2020.csv\")\n",
+    "\n",
+    "# Merge the data\n",
+    "merged_data = pd.concat([file1, file2, file3], ignore_index=True)\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "def preprocess_data(df):\n",
+    "    # Convert compensation to numeric; non-numeric values become NaN\n",
+    "    df['ConvertedComp'] = pd.to_numeric(df['ConvertedComp'], errors='coerce')\n",
+    "    \n",
+    "    # Fill missing values in relevant columns\n",
+    "    df['EdLevel'] = df['EdLevel'].fillna('Unknown')\n",
+    "    df['YearsCodePro'] = pd.to_numeric(df['YearsCodePro'], errors='coerce').fillna(0)\n",
+    "    df['Gender'] = df['Gender'].fillna('Unknown')\n",
+    "    df['Ethnicity'] = df['Ethnicity'].fillna('Unknown')\n",
+    "    \n",
+    "    # Drop rows where ConvertedComp is NaN\n",
+    "    df = df.dropna(subset=['ConvertedComp'])\n",
+    "    \n",
+    "    return df\n",
+    "\n",
+    "# Preprocess the data\n",
+    "cleaned_data = preprocess_data(merged_data)\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Count the number of respondents by ethnicity\n",
+    "ethnicity_counts = cleaned_data['Ethnicity'].value_counts()\n",
+    "\n",
+    "# Calculate average salary by ethnicity\n",
+    "avg_salary_by_ethnicity = cleaned_data.groupby('Ethnicity')['ConvertedComp'].mean()\n",
+    "\n",
+    "print(\"Number of Respondents by Ethnicity:\")\n",
+    "print(ethnicity_counts)\n",
+    "print(\"\\nAverage Salary by Ethnicity:\")\n",
+    "print(avg_salary_by_ethnicity)\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Number of Respondents by Ethnicity\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "ethnicity_counts.plot(kind='bar')\n",
+    "plt.title('Number of Respondents by Ethnicity')\n",
+    "plt.xlabel('Ethnicity')\n",
+    "plt.ylabel('Number of Respondents')\n",
+    "plt.show()\n",
+    "\n",
+    "# Average Salary by Ethnicity\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "avg_salary_by_ethnicity.plot(kind='bar')\n",
+    "plt.title('Average Salary by Ethnicity')\n",
+    "plt.xlabel('Ethnicity')\n",
+    "plt.ylabel('Average Salary (ConvertedComp)')\n",
+    "plt.show()\n"
+   ]
   },
   {
    "cell_type": "markdown",