- 综合讨论区
- 帖子详情
老师参与
用requests爬取百度图片的关键反反爬措施
周文扬
发表于2021年02月11日
<p><code class="brush:python;toolbar:false" >import re
import random
import requests
# Browser-like request headers: the key counter-anti-scraping measure.
# According to the original author, the crawl fails unless headers like
# these are sent with the search request.
# The User-Agent and Accept values mirror what Chrome 87 on Windows sends
# (the previous values contained the typos "win64", a missing space before
# "AppleWebKit", and "siqned-exchange").
he = {
    'Host': 'image.baidu.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}
def getBaiduPictures(gjc, n, save_dir='C:\\Users\\明月心\\temp\\', timeout=10):
    """Download up to ``n`` Baidu image-search thumbnails for each keyword.

    For every keyword in ``gjc`` the Baidu image search page is requested
    with the module-level ``he`` headers, the first ``n`` ``thumbURL``
    values found in the response text are downloaded, and each image is
    written to ``save_dir`` as ``<keyword><index>.jpg``.

    Args:
        gjc: Iterable of search keywords (strings).
        n: Maximum number of images to save per keyword.
        save_dir: Directory the images are written to. It must already
            exist. The default is the directory that was previously
            hard-coded in this function.
        timeout: Timeout in seconds passed to every ``requests.get`` call.

    Returns:
        A list with the paths of the files that were written successfully.

    Raises:
        requests.RequestException: If a search page cannot be fetched.
            Failures of individual image downloads are logged and skipped.
    """
    import logging
    import os
    from urllib.parse import quote

    log = logging.getLogger(__name__)
    url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word="
    saved = []
    for kw in gjc:
        # Percent-encode the keyword so characters such as '&' or '#'
        # cannot corrupt the query string.
        search_url = url + quote(kw, safe='')
        # Sending the browser-like headers is the counter-anti-scraping measure.
        page = requests.get(search_url, headers=he, timeout=timeout).text
        thumbs = re.findall('thumbURL":"(.*?)"', page)[0:n]
        for i, thumb in enumerate(thumbs):
            path = os.path.join(save_dir, '{0}{1}{2}'.format(kw, i, ".jpg"))
            try:
                p = requests.get(thumb, timeout=timeout)
                # Do not save an HTTP error page under a .jpg name.
                p.raise_for_status()
                # "wb": the image bytes are written in binary mode.
                with open(path, "wb") as f:
                    f.write(p.content)
            except (requests.RequestException, OSError) as e:
                # Best effort: report the failure and go on with the next image.
                log.warning("Could not save %s to %s: %s", thumb, path, e)
                continue
            saved.append(path)
    return saved
# Example run: search for each of the three keywords below and ask
# getBaiduPictures to save up to 20 images per keyword.
so = ["佳人","美景","牛"]
getBaiduPictures(so,20)</code></p>
3
回复