爬虫学习3——BeautifulSoup_unijhql的技术博客_

link之家

链接快照平台

输入网页链接，自动生成快照
标签化管理网页链接

相关文章推荐

活泼的奔马 · javascript - Chrome ...· 9 月前 ·

失恋的牙膏 · 王石磊任中国铝业集团有限公司董事、总经理、党 ...· 10 月前 ·

逆袭的登山鞋 · 这个妲己不仅能娱乐，还能陪你玩王者，你不来看 ...· 1 年前 ·

强健的企鹅 · 奇点汽车冲击科创板：沈海寅的殊死一搏_搜狐汽 ...· 1 年前 ·

爱听歌的刺猬 · 【江淮iEVA50】-江淮iEVA50报价- ...· 1 年前 ·

Once upon a time there were many children and their names were Elsie, Lacie and Tillie; Bob; King; Mary; Target said：“I am very strong”; Jack; and they lived at the bottom of a well.

...

# Quick tour of the BeautifulSoup API. `html` is the sample document shown
# above; every statement just rebinds `doc` so the result can be inspected.
bs = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser

doc = bs.prettify()      # pretty-printed markup (prettify is a method: call it)
doc = bs.title.string    # text content of the <title> tag
doc = bs.title.text      # also the text content of the <title> tag
doc = bs.a.text          # text content of the first <a> tag
doc = bs.a.string        # text content of the first <a> tag
doc = bs.title           # the <title> tag itself
doc = bs.head            # the <head> tag itself
doc = bs.body.a          # nested access: first <a> inside <body> only
doc = bs.p['class']      # value of the class attribute of the first <p>
doc = bs.find_all('a')   # every <a> tag
doc = bs.find('a')       # only the first <a> tag
doc = bs.a.parent        # direct parent of the first <a>

doc = bs.a.parents       # ancestors of the first <a>; this is a generator,
# so iterate to see them:
#     for item in doc:
#         print(item)

doc = bs.a.next_sibling  # the single node right after the first <a>
# To visit every following sibling, iterate `bs.a.next_siblings` instead:
#     for item in bs.a.next_siblings:
#         print(item)
#
# Related search methods:
#     find_next_siblings()      all matching siblings after this node
#     find_previous_siblings()  all matching siblings before this node
#     find_next_sibling()       first matching sibling after this node
#     find_previous_sibling()   first matching sibling before this node

doc = bs.find_all('a')                      # every <a> tag
doc = bs.find_all(attrs={'id': 'link1'})    # filter by an attribute dict
doc = bs.find_all(attrs={'id': 'link3'})    # filter by an attribute dict
doc = bs.find_all(id='link3')               # same filter as a keyword argument
doc = bs.find_all(class_='brother')         # note the trailing underscore in class_

# `string=` (spelled `text=` in older bs4 releases) matches whole strings only.
doc = bs.find_all(string='Target')  # no match: per the original note, the text is longer than 'Target'
doc = bs.find_all(string='Bob')     # matches: a string is exactly 'Bob'
#     find_all_next()  every matching node after this one
#     find_next()      first matching node after this one

# select() takes CSS selectors.
doc = bs.select('#link3')    # by id
doc = bs.select('.brother')  # by class
doc = bs.select('p a')       # every <a> nested inside a <p>

doc = bs.select('a')         # every <a>; loop to read each href:
#     for item in doc:
#         print(item['href'])

doc = bs.select('a')         # every <a>; loop to read each tag's text:
#     for item in doc:
#         print(item.text)

实战：爬取豆瓣上指定的电影信息：

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import re
def write_info(head: str, body: str) -> None:
    """Append one record to ``moveinfo.txt`` in the current working directory.

    The record is ``head`` immediately followed by ``body`` and two newlines,
    so consecutive records end up separated by an empty line.

    Args:
        head: Heading text for the record.
        body: Detail text written directly after the heading.
    """
    # Append mode keeps earlier records. The ``with`` block closes the file
    # on exit, so no explicit close() call is needed.
    with open('moveinfo.txt', 'a', encoding='utf-8') as f:
        f.write(head + body + '\n\n')
def get_info(type, url, name):
    """Download a detail page and save the text of its ``#info`` element(s).

    For every element matching the CSS selector ``#info`` a progress message
    and the heading are printed, then the heading and the element's text are
    passed to ``write_info``. Pages without such an element produce no output.

    Args:
        type: Category label used in the heading. The name shadows the
            builtin; it is kept so existing keyword callers keep working.
        url: Address of the detail page to download.
        name: Title used in the progress message and the heading.
    """
    try:
        # Without a timeout requests can wait on a stalled server forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report and skip this page so one bad result does not stop the run.
        print('加载失败 ', url, ":", exc)
        return

    doc = BeautifulSoup(response.text, 'lxml')
    head = type + ":" + name  # identical for every match, so build it once
    for block in doc.select('#info'):  # '#info' selects by element id
        print('正在加载 ',type,":","《" +name+ "》",' 信息......')
        print(head)
        write_info(head, block.text)
def get_url(search):
    """Search Douban for ``search`` and collect details for each parsed hit.

    The search page's ``<h3>`` headings are rendered back to markup and
    scanned with a regular expression that captures, per hit, the bracketed
    label, the link target and the link text. Each triple is forwarded to
    ``get_info``.

    Args:
        search: Free-text query; it is percent-encoded before being placed
            in the URL.
    """
    # NOTE(review): cat=1002 is presumably Douban's film category — confirm.
    url = 'https://www.douban.com/search?cat=1002&q=' + quote(search)
    try:
        # Without a timeout requests can wait on a stalled server forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('搜索失败 ', url, ":", exc)
        return

    headings = str(BeautifulSoup(response.text, 'lxml').find_all('h3'))
    # Raw string: '\[' in a normal literal is an invalid escape sequence.
    # re.S lets '.' span the newlines inside each heading.
    pattern = re.compile(
        r'<h3>.*?<span>\[(.*?)\]</spa.*?href="(.*?)".*?target.*?>(.*?)</a>',
        re.S,
    )
    for category, link, title in pattern.findall(headings):
        print('====================================================')
        get_info(category, link, title)
if __name__ == '__main__':
    # Script entry point: read a query from stdin and pass it to the search.
    get_url(input('请输入要收集的电影信息：'))