Skip to content

Commit 4292ba5

Browse files
Determine if there is a difference between men's and women's incomes.
1 parent ac0b8e4 commit 4292ba5

File tree

2 files changed

+196
-0
lines changed

2 files changed

+196
-0
lines changed

.ipynb_checkpoints/Stackoverflow_Survey_Analysis-checkpoint.ipynb

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21821,6 +21821,104 @@
2182121821
"plt.show()\n"
2182221822
]
2182321823
},
21824+
{
21825+
"cell_type": "markdown",
21826+
"metadata": {},
21827+
"source": [
21828+
"# Determine if there is a difference between men's and women's incomes."
21829+
]
21830+
},
21831+
{
21832+
"cell_type": "code",
21833+
"execution_count": 1,
21834+
"metadata": {},
21835+
"outputs": [
21836+
{
21837+
"ename": "FileNotFoundError",
21838+
"evalue": "[Errno 2] No such file or directory: 'Data/survey_results_public_2018.csv'",
21839+
"output_type": "error",
21840+
"traceback": [
21841+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
21842+
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
21843+
"Cell \u001b[1;32mIn[1], line 5\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m stats\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# Load the CSV files\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m file1 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData/survey_results_public_2018.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 6\u001b[0m file2 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData/survey_results_public_2019.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 7\u001b[0m file3 \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData/survey_results_public_2020.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
21844+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
21845+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
21846+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 935\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 936\u001b[0m dialect,\n\u001b[0;32m 937\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 946\u001b[0m defaults\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelimiter\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[0;32m 947\u001b[0m )\n\u001b[0;32m 948\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
21847+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 602\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 604\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[0;32m 607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[0;32m 608\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
21848+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1439\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1441\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_engine(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine)\n",
21849+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1733\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[0;32m 1734\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m get_handle(\n\u001b[0;32m 1736\u001b[0m f,\n\u001b[0;32m 1737\u001b[0m mode,\n\u001b[0;32m 1738\u001b[0m encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[0;32m 1739\u001b[0m compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[0;32m 1740\u001b[0m memory_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmemory_map\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[0;32m 1741\u001b[0m is_text\u001b[38;5;241m=\u001b[39mis_text,\n\u001b[0;32m 1742\u001b[0m errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding_errors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[0;32m 1743\u001b[0m storage_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstorage_options\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[0;32m 1744\u001b[0m )\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 1746\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
21850+
"File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 851\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 852\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 853\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 854\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[0;32m 855\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[0;32m 857\u001b[0m handle,\n\u001b[0;32m 858\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m 859\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[0;32m 860\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[0;32m 861\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 862\u001b[0m )\n\u001b[0;32m 863\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 864\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m 865\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
21851+
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Data/survey_results_public_2018.csv'"
21852+
]
21853+
}
21854+
],
21855+
"source": [
21856+
"import pandas as pd\n",
21857+
"from scipy import stats\n",
21858+
"\n",
21859+
"# Load the CSV files\n",
21860+
"file1 = pd.read_csv(r\"Data/survey_results_public_2018.csv\")\n",
21861+
"file2 = pd.read_csv(r\"Data/survey_results_public_2019.csv\")\n",
21862+
"file3 = pd.read_csv(r\"Data/survey_results_public_2020.csv\")\n",
21863+
"\n",
21864+
"# Merge the data\n",
21865+
"merged_data = pd.concat([file1, file2, file3], ignore_index=True)\n",
21866+
"\n",
21867+
"def preprocess_data(df):\n",
21868+
" # Convert compensation to numeric, ignoring non-numeric values\n",
21869+
" df['ConvertedComp'] = pd.to_numeric(df['ConvertedComp'], errors='coerce')\n",
21870+
" \n",
21871+
" # Fill missing values in relevant columns\n",
21872+
" df['EdLevel'] = df['EdLevel'].fillna('Unknown')\n",
21873+
" df['YearsCodePro'] = pd.to_numeric(df['YearsCodePro'], errors='coerce').fillna(0)\n",
21874+
" df['Gender'] = df['Gender'].fillna('Unknown')\n",
21875+
" \n",
21876+
" # Drop rows where ConvertedComp is NaN\n",
21877+
" df = df.dropna(subset=['ConvertedComp'])\n",
21878+
" \n",
21879+
" return df\n",
21880+
"\n",
21881+
"# Preprocess the data\n",
21882+
"cleaned_data = preprocess_data(merged_data)\n",
21883+
"\n",
21884+
"# Filter data for males and females\n",
21885+
"male_incomes = cleaned_data[cleaned_data['Gender'] == 'Male']['ConvertedComp']\n",
21886+
"female_incomes = cleaned_data[cleaned_data['Gender'] == 'Female']['ConvertedComp']\n",
21887+
"\n",
21888+
"# Perform t-test\n",
21889+
"t_stat, p_value = stats.ttest_ind(male_incomes, female_incomes, nan_policy='omit')\n",
21890+
"\n",
21891+
"print(f\"T-statistic: {t_stat}\")\n",
21892+
"print(f\"P-value: {p_value}\")\n",
21893+
"\n",
21894+
"# Interpretation\n",
21895+
"if p_value < 0.05:\n",
21896+
" print(\"There is a significant difference between men's and women's incomes.\")\n",
21897+
"else:\n",
21898+
" print(\"There is no significant difference between men's and women's incomes.\")\n"
21899+
]
21900+
},
21901+
{
21902+
"cell_type": "code",
21903+
"execution_count": null,
21904+
"metadata": {},
21905+
"outputs": [],
21906+
"source": []
21907+
},
21908+
{
21909+
"cell_type": "code",
21910+
"execution_count": null,
21911+
"metadata": {},
21912+
"outputs": [],
21913+
"source": []
21914+
},
21915+
{
21916+
"cell_type": "code",
21917+
"execution_count": null,
21918+
"metadata": {},
21919+
"outputs": [],
21920+
"source": []
21921+
},
2182421922
{
2182521923
"cell_type": "markdown",
2182621924
"metadata": {},

0 commit comments

Comments
 (0)