html = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ # 创建一个名为soup的BeautifulSoup对象 soup = BeautifulSoup(html, 'lxml')
1 2
# 按照标准的缩进格式的结构输出 print(soup.prettify())
1 2
# 获取标题 print(soup.title)
<title>The Dormouse’s story</title>
1 2
# 获取标题文本 print(soup.title.text)
The Dormouse’s story
1 2
# 获取所有文字内容 print(soup.get_text())
1 2 3 4 5 6 7 8 9
The Dormouse's story
The Dormouse's story Once upon a time there were three little sisters; and their names were , Lacie and Tillie; and they lived at the bottom of a well. ...