流沙团
豆瓣电影爬虫脚本
2021-3-24 流沙团
import re
from urllib.request import urlopen, Request

def getPage(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
ret = Request(url, headers=headers)
res = urlopen(ret)
return res.read().decode('utf-8')

def parsePage(s): # s 网页源码
ret = com.finditer(s)
for i in ret:
ret = {
"id": i.group("id"),
"title": i.group("title"),
"rating_num": i.group("rating_num"),
"comment_num": i.group("comment_num")
}
yield ret

def main(num):
url = 'https://movie.douban.com/top250?start=%s&filter=' % num # 0
response_html = getPage(url) # response_html是这个网页的源码 str
ret = parsePage(response_html) # 生成器
print(ret)
f = open("move_info7", "a", encoding="utf8")
for obj in ret:
print(obj)
data = str(obj)
f.write(data + "\n")
f.close()

com = re.compile(
'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>', re.S)
count = 0
for i in range(10):
main(count) # count = 0
count += 25


常用伪装如下:



安装 fake-useragent



pip install fake-useragent



pip install -U fake-useragent







测试如下



from fake_useragent import UserAgent
ua = UserAgent()
#ie浏览器的user agent
print(ua.ie)

#opera浏览器
print(ua.opera)

#chrome浏览器
print(ua.chrome)

#firefox浏览器
print(ua.firefox)

#safri浏览器
print(ua.safari)


# 随意变换headers
print(ua.random)
print(ua.random)







豆瓣有反扒机制,需要实践测试

















发表评论:
昵称

邮件地址 (选填)

个人主页 (选填)

内容