From 32ccb7f870d4f6b11c13a072cec7d92a2e7fe3ac Mon Sep 17 00:00:00 2001
From: Raakesh <cb.en.p2aid19023@cb.students.amrita.edu>
Date: Thu, 2 Jul 2020 09:48:21 +0530
Subject: [PATCH] Preprocessing

---
 preprocessing-csv-rsna-ih-2019-19023.ipynb | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 preprocessing-csv-rsna-ih-2019-19023.ipynb

diff --git a/preprocessing-csv-rsna-ih-2019-19023.ipynb b/preprocessing-csv-rsna-ih-2019-19023.ipynb
new file mode 100644
index 0000000..2c7db41
--- /dev/null
+++ b/preprocessing-csv-rsna-ih-2019-19023.ipynb
@@ -0,0 +1 @@
+{"cells":[{"metadata":{},"cell_type":"markdown","source":"## Preprocessing CSV's for training"},{"metadata":{},"cell_type":"markdown","source":"![](https://www.rsna.org/-/media/Images/RSNA/Menu/logo_sml.ashx?w=100&la=en&hash=9619A8238B66C7BA9692C1FC3A5C9E97C24A06E1)"},{"metadata":{},"cell_type":"markdown","source":"Are you working a lot with Data Generators (for example Keras' \".flow_from_dataframe\") and competing in the [RSNA Intracranial Hemorrhage 2019 competition](https://www.kaggle.com/c/rsna-intracranial-hemorrhage-detection)? \n\nI've created a function that creates a simple preprocessed DataFrame with a column for ImageID and a column for each label in the competition. ('epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any') \n\nI also made a function which translates your predictions into the correct submission format.\n\nIf you are also interested in getting the metadata as CSV files, you can check out [this Kaggle kernel](https://www.kaggle.com/carlolepelaars/converting-dicom-metadata-to-csv-rsna-ihd-2019). 
# We will only need OS and Pandas for this one
import os
import pandas as pd

# Path names
BASE_PATH = "../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/"
TRAIN_PATH = os.path.join(BASE_PATH, 'stage_2_train.csv')
TEST_PATH = os.path.join(BASE_PATH, 'stage_2_sample_submission.csv')

# All labels that we have to predict in this competition
targets = ['epidural', 'intraparenchymal',
           'intraventricular', 'subarachnoid',
           'subdural', 'any']

# File sizes and specifications
print('\n# Files and file sizes')
# NOTE(review): os.listdir() returns entries in arbitrary order, so which two
# entries the [2:] slice skips is platform dependent -- confirm the intent.
for file in os.listdir(BASE_PATH)[2:]:
    size_mb = round(os.path.getsize(os.path.join(BASE_PATH, file)) / 1000000, 2)
    print('{}| {} MB'.format(file.ljust(30), str(size_mb)))

# Raw training labels: one row per (image, label type) pair
train_df = pd.read_csv(TRAIN_PATH)
# Everything before the last '_' is the image id. `n` must be passed by keyword:
# pandas 2.x no longer accepts it positionally in Series.str.rsplit.
train_df['ImageID'] = train_df['ID'].str.rsplit('_', n=1).str[0] + '.png'
# All labels of an image collected into one list per ImageID
label_lists = train_df.groupby('ImageID')['Label'].apply(list)
def prepare_df(path, train=False, nrows=None):
    """
    Prepare Pandas DataFrame for fitting neural network models

    Reads a CSV with an 'ID' column of the form '<image id>_<label type>'
    (for example 'ID_0002081b6_epidural') and a 'Label' column, and pivots
    it so that every image gets exactly one row.

    path: Path of the CSV file to read
    train: Unused; kept so existing calls like prepare_df(path, train=True) keep working
    nrows: Optional number of CSV rows to read (forwarded to pd.read_csv)

    Returns a DataFrame with an 'ImageID' column ('<image id>.png') and
    one column per label type found in the file, holding the 'Label' values
    """
    df = pd.read_csv(path, nrows=nrows)

    # Split once at the LAST underscore: left part is the image id, right part
    # is the label type. `n` is passed by keyword because pandas 2.x rejects
    # it as a positional argument.
    id_parts = df['ID'].str.rsplit('_', n=1)
    df['ImageID'] = id_parts.str[0] + '.png'
    df['type'] = id_parts.str[1]

    # Exact duplicate rows would make pivot() fail, so drop them first
    unique_rows = df[['Label', 'ImageID', 'type']].drop_duplicates()

    # Create new DataFrame by pivoting: one row per image, one column per label type
    new_df = unique_rows.pivot(index='ImageID',
                               columns='type',
                               values='Label').reset_index()
    return new_df
 }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th>type</th>\n      <th>ImageID</th>\n      <th>any</th>\n      <th>epidural</th>\n      <th>intraparenchymal</th>\n      <th>intraventricular</th>\n      <th>subarachnoid</th>\n      <th>subdural</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>0</td>\n      <td>ID_000012eaf.png</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>1</td>\n      <td>ID_000039fa0.png</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>2</td>\n      <td>ID_00005679d.png</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>3</td>\n      <td>ID_00008ce3c.png</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>4</td>\n      <td>ID_0000950d7.png</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}},{"output_type":"stream","text":"Test data: \n","name":"stdout"},{"output_type":"execute_result","execution_count":7,"data":{"text/plain":"type           ImageID  any  epidural  intraparenchymal  intraventricular  \\\n0     ID_000000e27.png  0.5       0.5               0.5               0.5   \n1     ID_000009146.png  0.5       0.5               0.5               0.5   \n2     ID_00007b8cb.png  0.5       0.5               0.5               0.5   \n3     ID_000134952.png  0.5       0.5               0.5               0.5   \n4     ID_000176f2a.png  0.5       0.5               0.5               0.5   \n\ntype  subarachnoid  subdural  \n0              0.5       0.5  \n1              0.5       0.5  
# Save to CSV: write both preprocessed frames without the positional index column
for frame, csv_name in ((train_df, 'clean_train_df.csv'),
                        (test_df, 'clean_test_df.csv')):
    frame.to_csv(csv_name, index=False)
def create_submission_file(IDs, preds, labels=None):
    """
    Creates a submission file for Kaggle when given image ID's and predictions

    IDs: A list of all image IDs (everything from the first '.' on is cut off)
    preds: A list of lists containing all predictions for each image,
           in the same order as the label names
    labels: Optional list of label names; defaults to the module-level `targets`

    Returns a DataFrame with columns 'ID' ('<image id>_<label>') and 'Label',
    one row for each ID / label combination
    Raises ValueError if a prediction row does not have one value per label
    """
    label_names = list(targets if labels is None else labels)
    sub_dict = {'ID': [], 'Label': []}
    # Create a row for each ID / Label combination
    for i, ID in enumerate(IDs):
        ID = ID.split('.')[0] # Remove extension such as .png
        row = list(preds[i])
        # A short or long row would silently shift labels onto the wrong IDs
        if len(row) != len(label_names):
            raise ValueError(
                f"Expected {len(label_names)} predictions for {ID}, got {len(row)}")
        sub_dict['ID'].extend([f"{ID}_{label}" for label in label_names])
        sub_dict['Label'].extend(row)
    return pd.DataFrame(sub_dict)
}\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>ID</th>\n      <th>Label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>0</td>\n      <td>ID_000012eaf_epidural</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>1</td>\n      <td>ID_000012eaf_intraparenchymal</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>2</td>\n      <td>ID_000012eaf_intraventricular</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>3</td>\n      <td>ID_000012eaf_subarachnoid</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>4</td>\n      <td>ID_000012eaf_subdural</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <td>5</td>\n      <td>ID_000012eaf_any</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4}
\ No newline at end of file
-- 
GitLab