From 2af5a32b989436f98fba82a8f871844c3ff9cde8 Mon Sep 17 00:00:00 2001 From: aslesha <cb.en.u4cse16259@cb.students.amrita.edu> Date: Sat, 2 Feb 2019 09:41:21 +0530 Subject: [PATCH] Upload New File --- Lab5/webscraping.ipynb | 229 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 Lab5/webscraping.ipynb diff --git a/Lab5/webscraping.ipynb b/Lab5/webscraping.ipynb new file mode 100644 index 0000000..211ddc6 --- /dev/null +++ b/Lab5/webscraping.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "import requests\n\npage = requests.get(\"http://dataquestio.github.io/web-scraping-pages/simple.html\")\npage", + "execution_count": 1, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 1, + "data": { + "text/plain": "<Response [200]>" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "page.status_code\n\n", + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 2, + "data": { + "text/plain": "200" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "page.content\n", + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 3, + "data": { + "text/plain": "b'<!DOCTYPE html>\\n<html>\\n <head>\\n <title>A simple example page</title>\\n </head>\\n <body>\\n <p>Here is some simple content for this page.</p>\\n </body>\\n</html>'" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "from bs4 import BeautifulSoup\nsoup = BeautifulSoup(page.content, 'html.parser')\n", + "execution_count": 4, + "outputs": [] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "print(soup.prettify())\n", + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": "<!DOCTYPE html>\n<html>\n <head>\n <title>\n A simple example page\n </title>\n </head>\n <body>\n <p>\n Here is some simple content for this page.\n </p>\n </body>\n</html>\n", + "name": "stdout" + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "list(soup.children)\n", + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 6, + "data": { + "text/plain": "['html', '\\n', <html>\n <head>\n <title>A simple example page</title>\n </head>\n <body>\n <p>Here is some simple content for this page.</p>\n </body>\n </html>]" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "[type(item) for item in list(soup.children)]\n", + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 7, + "data": { + "text/plain": "[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "html = list(soup.children)[2]\n", + "execution_count": 13, + "outputs": [] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "list(html.children)\n", + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 15, + "data": { + "text/plain": "['\\n', <head>\n <title>A simple example page</title>\n </head>, '\\n', <body>\n <p>Here is some simple content for this page.</p>\n </body>, '\\n']" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "body = list(html.children)[3]\n", + "execution_count": 21, + "outputs": [] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "list(body.children)\n", + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 22, + "data": { + "text/plain": "['\\n', <p>Here is some simple content for this page.</p>, '\\n']" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "p = list(body.children)[1]", + "execution_count": 23, + "outputs": [] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "p.get_text()", + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "execution_count": 25, + "data": { + "text/plain": "'Here is some simple content for this page.'" + }, + "metadata": {} + } + ] + }, + { + "metadata": { + "trusted": true + }, + "cell_type": "code", + "source": "", + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3", + "language": "python" + }, + "language_info": { + "mimetype": "text/x-python", + "nbconvert_exporter": "python", + "name": "python", + "file_extension": ".py", + "version": "3.5.4", + "pygments_lexer": "ipython3", + "codemirror_mode": { + "version": 3, + "name": "ipython" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file -- GitLab