{
  "cells": [
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "import requests\n\npage = requests.get(\"http://dataquestio.github.io/web-scraping-pages/simple.html\")\npage",
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 1,
          "data": {
            "text/plain": "<Response [200]>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "page.status_code\n\n",
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 2,
          "data": {
            "text/plain": "200"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "page.content\n",
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/plain": "b'<!DOCTYPE html>\\n<html>\\n    <head>\\n        <title>A simple example page</title>\\n    </head>\\n    <body>\\n        <p>Here is some simple content for this page.</p>\\n    </body>\\n</html>'"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "from bs4 import BeautifulSoup\nsoup = BeautifulSoup(page.content, 'html.parser')\n",
      "execution_count": 4,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "print(soup.prettify())\n",
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": "<!DOCTYPE html>\n<html>\n <head>\n  <title>\n   A simple example page\n  </title>\n </head>\n <body>\n  <p>\n   Here is some simple content for this page.\n  </p>\n </body>\n</html>\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "list(soup.children)\n",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 6,
          "data": {
            "text/plain": "['html', '\\n', <html>\n <head>\n <title>A simple example page</title>\n </head>\n <body>\n <p>Here is some simple content for this page.</p>\n </body>\n </html>]"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "[type(item) for item in list(soup.children)]\n",
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 7,
          "data": {
            "text/plain": "[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "html = list(soup.children)[2]\n",
      "execution_count": 13,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "list(html.children)\n",
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 15,
          "data": {
            "text/plain": "['\\n', <head>\n <title>A simple example page</title>\n </head>, '\\n', <body>\n <p>Here is some simple content for this page.</p>\n </body>, '\\n']"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "body = list(html.children)[3]\n",
      "execution_count": 21,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "list(body.children)\n",
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 22,
          "data": {
            "text/plain": "['\\n', <p>Here is some simple content for this page.</p>, '\\n']"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "p = list(body.children)[1]",
      "execution_count": 23,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "p.get_text()",
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 25,
          "data": {
            "text/plain": "'Here is some simple content for this page.'"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "language_info": {
      "mimetype": "text/x-python",
      "nbconvert_exporter": "python",
      "name": "python",
      "file_extension": ".py",
      "version": "3.5.4",
      "pygments_lexer": "ipython3",
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}