{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Web scraping basics with `requests` and BeautifulSoup\n",
    "\n",
    "Download a simple example page, parse it with BeautifulSoup, and walk the parse tree from the document root down to the text of its `<p>` tag."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "PAGE_URL = \"http://dataquestio.github.io/web-scraping-pages/simple.html\"\n",
    "REQUEST_TIMEOUT_S = 10  # seconds; without a timeout requests can wait forever"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "<Response [200]>"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT_S)\n",
    "page.raise_for_status()  # stop here on an HTTP error instead of parsing an error page\n",
    "page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "200"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "b'<!DOCTYPE html>\\n<html>\\n <head>\\n <title>A simple example page</title>\\n </head>\\n <body>\\n <p>Here is some simple content for this page.</p>\\n </body>\\n</html>'"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "page.content"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse the HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(page.content, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "<!DOCTYPE html>\n<html>\n <head>\n <title>\n A simple example page\n </title>\n </head>\n <body>\n <p>\n Here is some simple content for this page.\n </p>\n </body>\n</html>\n"
    }
   ],
   "source": [
    "print(soup.prettify())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Walk the parse tree\n",
    "\n",
    "The top level of the document holds three nodes: the doctype, a newline string, and the `<html>` tag. Each step below lists a node's children and then picks the next tag by position; the `assert` after each pick fails loudly if the page layout ever changes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "['html', '\\n', <html>\n <head>\n <title>A simple example page</title>\n </head>\n <body>\n <p>Here is some simple content for this page.</p>\n </body>\n </html>]"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(soup.children)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[type(item) for item in soup.children]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Index 2 is the <html> tag; indexes 0 and 1 are the doctype and a newline.\n",
    "html = list(soup.children)[2]\n",
    "assert html.name == \"html\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "['\\n', <head>\n <title>A simple example page</title>\n </head>, '\\n', <body>\n <p>Here is some simple content for this page.</p>\n </body>, '\\n']"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(html.children)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Index 3 is the <body> tag; newline strings sit between the tags.\n",
    "body = list(html.children)[3]\n",
    "assert body.name == \"body\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "['\\n', <p>Here is some simple content for this page.</p>, '\\n']"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(body.children)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Index 1 is the <p> tag, surrounded by newline strings.\n",
    "p = list(body.children)[1]\n",
    "assert p.name == \"p\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "'Here is some simple content for this page.'"
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p.get_text()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}