From 32ccb7f870d4f6b11c13a072cec7d92a2e7fe3ac Mon Sep 17 00:00:00 2001 From: Raakesh <cb.en.p2aid19023@cb.students.amrita.edu> Date: Thu, 2 Jul 2020 09:48:21 +0530 Subject: [PATCH] Preprocessing --- preprocessing-csv-rsna-ih-2019-19023.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 preprocessing-csv-rsna-ih-2019-19023.ipynb diff --git a/preprocessing-csv-rsna-ih-2019-19023.ipynb b/preprocessing-csv-rsna-ih-2019-19023.ipynb new file mode 100644 index 0000000..2c7db41 --- /dev/null +++ b/preprocessing-csv-rsna-ih-2019-19023.ipynb @@ -0,0 +1 @@ +{"cells":[{"metadata":{},"cell_type":"markdown","source":"## Preprocessing CSV's for training"},{"metadata":{},"cell_type":"markdown","source":""},{"metadata":{},"cell_type":"markdown","source":"Are you working a lot with Data Generators (for example Keras' \".flow_from_dataframe\") and competing in the [RSNA Intracranial Hemorrhage 2019 competition](https://www.kaggle.com/c/rsna-intracranial-hemorrhage-detection)? \n\nI've created a function that creates a simple preprocessed DataFrame with a column for ImageID and a column for each label in the competition. ('epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any') \n\nI also made a function which translates your predictions into the correct submission format.\n\nIf you are also interested in getting the metadata as CSV files, you can check out [this Kaggle kernel](https://www.kaggle.com/carlolepelaars/converting-dicom-metadata-to-csv-rsna-ihd-2019). 
\n\n"},{"metadata":{},"cell_type":"markdown","source":"## Preparation"},{"metadata":{"trusted":true},"cell_type":"code","source":"# We will only need OS and Pandas for this one\nimport os\nimport pandas as pd\n\n# Path names\nBASE_PATH = \"../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/\"\nTRAIN_PATH = BASE_PATH + 'stage_2_train.csv'\nTEST_PATH = BASE_PATH + 'stage_2_sample_submission.csv'\n\n# All labels that we have to predict in this competition\ntargets = ['epidural', 'intraparenchymal', \n 'intraventricular', 'subarachnoid', \n 'subdural', 'any']","execution_count":1,"outputs":[]},{"metadata":{"_kg_hide-input":true,"trusted":true},"cell_type":"code","source":"# File sizes and specifications\nprint('\\n# Files and file sizes')\nfor file in os.listdir(BASE_PATH)[2:]:\n print('{}| {} MB'.format(file.ljust(30), \n str(round(os.path.getsize(BASE_PATH + file) / 1000000, 2))))","execution_count":2,"outputs":[{"output_type":"stream","text":"\n# Files and file sizes\nstage_2_train | 26.59 MB\nstage_2_train.csv | 119.7 MB\n","name":"stdout"}]},{"metadata":{},"cell_type":"markdown","source":"## Preprocessing CSV's"},{"metadata":{"trusted":true},"cell_type":"code","source":"train_df = pd.read_csv(TRAIN_PATH)\ntrain_df['ImageID'] = train_df['ID'].str.rsplit('_', 1).map(lambda x: x[0]) + '.png'\nlabel_lists = train_df.groupby('ImageID')['Label'].apply(list)","execution_count":3,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"train_df[train_df['ImageID'] == 'ID_0002081b6.png']","execution_count":4,"outputs":[{"output_type":"execute_result","execution_count":4,"data":{"text/plain":" ID Label ImageID\n770232 ID_0002081b6_epidural 0 ID_0002081b6.png\n770233 ID_0002081b6_intraparenchymal 1 ID_0002081b6.png\n770234 ID_0002081b6_intraventricular 0 ID_0002081b6.png\n770235 ID_0002081b6_subarachnoid 0 ID_0002081b6.png\n770236 ID_0002081b6_subdural 0 ID_0002081b6.png\n770237 ID_0002081b6_any 1 
ID_0002081b6.png","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>Label</th>\n <th>ImageID</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>770232</td>\n <td>ID_0002081b6_epidural</td>\n <td>0</td>\n <td>ID_0002081b6.png</td>\n </tr>\n <tr>\n <td>770233</td>\n <td>ID_0002081b6_intraparenchymal</td>\n <td>1</td>\n <td>ID_0002081b6.png</td>\n </tr>\n <tr>\n <td>770234</td>\n <td>ID_0002081b6_intraventricular</td>\n <td>0</td>\n <td>ID_0002081b6.png</td>\n </tr>\n <tr>\n <td>770235</td>\n <td>ID_0002081b6_subarachnoid</td>\n <td>0</td>\n <td>ID_0002081b6.png</td>\n </tr>\n <tr>\n <td>770236</td>\n <td>ID_0002081b6_subdural</td>\n <td>0</td>\n <td>ID_0002081b6.png</td>\n </tr>\n <tr>\n <td>770237</td>\n <td>ID_0002081b6_any</td>\n <td>1</td>\n <td>ID_0002081b6.png</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"def prepare_df(path, train=False, nrows=None):\n \"\"\"\n Prepare Pandas DataFrame for fitting neural network models\n Returns a DataFrame with an ImageID column\n and one column per label type (pivoted from the ID suffix)\n \"\"\" \n df = pd.read_csv(path, nrows=nrows)\n \n # Get ImageID and type for pivoting\n df['ImageID'] = df['ID'].str.rsplit('_', 1).map(lambda x: x[0]) + '.png'\n df['type'] = df['ID'].str.split(\"_\", n = 3, expand = True)[2]\n # Create new DataFrame by pivoting\n new_df = df[['Label', 'ImageID', 'type']].drop_duplicates().pivot(index='ImageID', \n columns='type', \n values='Label').reset_index()\n return 
new_df","execution_count":5,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Convert dataframes to preprocessed format\ntrain_df = prepare_df(TRAIN_PATH, train=True)\ntest_df = prepare_df(TEST_PATH)","execution_count":6,"outputs":[]},{"metadata":{"_kg_hide-input":true,"trusted":true},"cell_type":"code","source":"print('Training data: ')\ndisplay(train_df.head())\n\nprint('Test data: ')\ntest_df.head()","execution_count":7,"outputs":[{"output_type":"stream","text":"Training data: \n","name":"stdout"},{"output_type":"display_data","data":{"text/plain":"type ImageID any epidural intraparenchymal intraventricular \\\n0 ID_000012eaf.png 0 0 0 0 \n1 ID_000039fa0.png 0 0 0 0 \n2 ID_00005679d.png 0 0 0 0 \n3 ID_00008ce3c.png 0 0 0 0 \n4 ID_0000950d7.png 0 0 0 0 \n\ntype subarachnoid subdural \n0 0 0 \n1 0 0 \n2 0 0 \n3 0 0 \n4 0 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>type</th>\n <th>ImageID</th>\n <th>any</th>\n <th>epidural</th>\n <th>intraparenchymal</th>\n <th>intraventricular</th>\n <th>subarachnoid</th>\n <th>subdural</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>ID_000012eaf.png</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <td>1</td>\n <td>ID_000039fa0.png</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <td>2</td>\n <td>ID_00005679d.png</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <td>3</td>\n <td>ID_00008ce3c.png</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <td>4</td>\n <td>ID_0000950d7.png</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n 
<td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}},{"output_type":"stream","text":"Test data: \n","name":"stdout"},{"output_type":"execute_result","execution_count":7,"data":{"text/plain":"type ImageID any epidural intraparenchymal intraventricular \\\n0 ID_000000e27.png 0.5 0.5 0.5 0.5 \n1 ID_000009146.png 0.5 0.5 0.5 0.5 \n2 ID_00007b8cb.png 0.5 0.5 0.5 0.5 \n3 ID_000134952.png 0.5 0.5 0.5 0.5 \n4 ID_000176f2a.png 0.5 0.5 0.5 0.5 \n\ntype subarachnoid subdural \n0 0.5 0.5 \n1 0.5 0.5 \n2 0.5 0.5 \n3 0.5 0.5 \n4 0.5 0.5 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>type</th>\n <th>ImageID</th>\n <th>any</th>\n <th>epidural</th>\n <th>intraparenchymal</th>\n <th>intraventricular</th>\n <th>subarachnoid</th>\n <th>subdural</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>ID_000000e27.png</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n </tr>\n <tr>\n <td>1</td>\n <td>ID_000009146.png</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n </tr>\n <tr>\n <td>2</td>\n <td>ID_00007b8cb.png</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n </tr>\n <tr>\n <td>3</td>\n <td>ID_000134952.png</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n </tr>\n <tr>\n <td>4</td>\n <td>ID_000176f2a.png</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"metadata":{"_kg_hide-output":true,"trusted":true},"cell_type":"code","source":"# Save to CSV\ntrain_df.to_csv('clean_train_df.csv', 
index=False)\ntest_df.to_csv('clean_test_df.csv', index=False)","execution_count":8,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Creating submission file"},{"metadata":{"trusted":true},"cell_type":"code","source":"def create_submission_file(IDs, preds):\n \"\"\"\n Creates a submission file for Kaggle when given image ID's and predictions\n \n IDs: A list of all image IDs (Extensions will be cut off)\n preds: A list of lists containing all predictions for each image\n \n Returns a DataFrame that has the correct format for this competition\n \"\"\"\n sub_dict = {'ID': [], 'Label': []}\n # Create a row for each ID / Label combination\n for i, ID in enumerate(IDs):\n ID = ID.split('.')[0] # Remove extension such as .png\n sub_dict['ID'].extend([f\"{ID}_{target}\" for target in targets])\n sub_dict['Label'].extend(preds[i])\n return pd.DataFrame(sub_dict)","execution_count":9,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Finalize submission files\ntrain_sub_df = create_submission_file(train_df['ImageID'], train_df[targets].values)\ntest_sub_df = create_submission_file(test_df['ImageID'], test_df[targets].values)","execution_count":10,"outputs":[]},{"metadata":{"_kg_hide-input":true,"trusted":true},"cell_type":"code","source":"print('Back to the original submission format:')\ntrain_sub_df.head(6)","execution_count":11,"outputs":[{"output_type":"stream","text":"Back to the original submission format:\n","name":"stdout"},{"output_type":"execute_result","execution_count":11,"data":{"text/plain":" ID Label\n0 ID_000012eaf_epidural 0\n1 ID_000012eaf_intraparenchymal 0\n2 ID_000012eaf_intraventricular 0\n3 ID_000012eaf_subarachnoid 0\n4 ID_000012eaf_subdural 0\n5 ID_000012eaf_any 0","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" 
class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>Label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>ID_000012eaf_epidural</td>\n <td>0</td>\n </tr>\n <tr>\n <td>1</td>\n <td>ID_000012eaf_intraparenchymal</td>\n <td>0</td>\n </tr>\n <tr>\n <td>2</td>\n <td>ID_000012eaf_intraventricular</td>\n <td>0</td>\n </tr>\n <tr>\n <td>3</td>\n <td>ID_000012eaf_subarachnoid</td>\n <td>0</td>\n </tr>\n <tr>\n <td>4</td>\n <td>ID_000012eaf_subdural</td>\n <td>0</td>\n </tr>\n <tr>\n <td>5</td>\n <td>ID_000012eaf_any</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} \ No newline at end of file -- GitLab