"text/plain": "b'<!DOCTYPE html>\\n<html>\\n <head>\\n <title>A simple example page</title>\\n </head>\\n <body>\\n <p>Here is some simple content for this page.</p>\\n </body>\\n</html>'"
"text": "<!DOCTYPE html>\n<html>\n <head>\n <title>\n A simple example page\n </title>\n </head>\n <body>\n <p>\n Here is some simple content for this page.\n </p>\n </body>\n</html>\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "list(soup.children)\n",
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 6,
"data": {
"text/plain": "['html', '\\n', <html>\n <head>\n <title>A simple example page</title>\n </head>\n <body>\n <p>Here is some simple content for this page.</p>\n </body>\n </html>]"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "[type(item) for item in list(soup.children)]\n",
"text/plain": "['\\n', <head>\n <title>A simple example page</title>\n </head>, '\\n', <body>\n <p>Here is some simple content for this page.</p>\n </body>, '\\n']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "body = list(html.children)[3]\n",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "list(body.children)\n",
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 22,
"data": {
"text/plain": "['\\n', <p>Here is some simple content for this page.</p>, '\\n']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "p = list(body.children)[1]",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "p.get_text()",
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 25,
"data": {
"text/plain": "'Here is some simple content for this page.'"
b'<!DOCTYPE html>\n<html>\n<head>\n<title>A simple example page</title>\n</head>\n<body>\n<p>Here is some simple content for this page.</p>\n</body>\n</html>'