用requests爬取百度图片的关键反反爬措施

周文扬发表于2021年02月11日

<code class="brush:python;toolbar:false" >import re import random import requests #很关键的反反爬措施，不做如此定义会导致爬取失败。 he={ 'Host':'image.baidu.com', 'Connection':'keep-alive', 'Cache-Control':'max-age=0', 'Upgrade-Insecure-Requests':'1', 'User-Agent':'Mozilla/5.0 (Windows NT 10.0;win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/siqned-exchange;v=b3;q=0.9' } def getBaiduPictures(gjc,n):#下载n个百度图片搜来的关于word的图片保存到本地 url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=" urljh = [url+i for i in gjc] hqsj=[] for u in urljh: hqsj.append(requests.get(u,headers=he).text)  #定义headers，反反爬措施。 for d in range(len(hqsj)): r=re.findall('thumbURL":"(.*?)"',hqsj[d])[0:n] for i in range(len(r)): try: p = requests.get(r[i], stream=True) f = open('C:\\Users\\明月心\\temp\\{0}{1}{2}'.format(gjc[d],i,".jpg"), "wb") #"wb"表示二进制写方式打开文件 f.write(p.content)  #图片内容写入文件 f.close() except Exception as e : pass so = ["佳人","美景","牛"] getBaiduPictures(so,20)</code>