百度图片搜索爬虫提效作业完成-只打开一次浏览器,打开多个页面

周文扬 发表于2021年02月11日
import re
import random
import asyncio   # coroutine support (Python 3.6+)

import requests       # pip install requests
import pyppeteer as pyp

# Anti-anti-crawler measure: present a randomly chosen real-browser
# User-Agent instead of the default python-requests / headless signature.
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]
fakeHeaders = random.choice(user_agents)

# Global Chromium handle so the whole run shares ONE browser instance
# (the point of this exercise: one browser, many tabs).
browser = None


def getHtml(urllist):
    """Fetch the page source of every URL in *urllist* with pyppeteer.

    A single Chromium instance is launched once; each URL is loaded in its
    own new tab.  Currently tailored to Baidu image search result pages.

    Parameters
    ----------
    urllist : list[str]
        URLs to load, in order.

    Returns
    -------
    list[str]
        The HTML source of each page, in the same order as *urllist*.

    Notes
    -----
    Roughly 5-6x slower than requests.get, but executes JavaScript, which
    Baidu's image search requires.
    """

    async def _fetch_all(urls):
        # Launch Chromium exactly once for the whole batch (non-headless,
        # as headless mode is more easily detected by anti-bot checks).
        global browser
        browser = await pyp.launch(headless=False)
        try:
            pages_source = []
            for url in urls:
                page = await browser.newPage()          # one new tab per URL
                await page.setUserAgent(fakeHeaders)    # spoof UA
                # Hide the navigator.webdriver automation flag.
                await page.evaluateOnNewDocument(
                    '() =>{ Object.defineProperties(navigator,'
                    '{ webdriver:{ get:()=> false }})}')
                await page.goto(url)                    # load the page
                pages_source.append(await page.content())
            return pages_source
        finally:
            # Close the browser even if a page failed to load; the original
            # code leaked Chromium on any mid-batch error.
            await browser.close()

    # Drive the whole batch with a single run of the event loop instead of
    # one run_until_complete per URL.
    return asyncio.get_event_loop().run_until_complete(_fetch_all(urllist))


def getBaiduPictures(gjc, n):
    """Download up to *n* Baidu-image-search thumbnails per keyword.

    Parameters
    ----------
    gjc : list[str]
        Search keywords; one results page is fetched per keyword.
    n : int
        Maximum number of images to save per keyword.

    Side effects
    ------------
    Writes image files to C:\\Users\\明月心\\temp\\ named
    ``<keyword><index><ext>``.  Individual download/write failures are
    ignored (best effort), matching the original behaviour.
    """
    url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word="
    urljh = [url + kw for kw in gjc]
    hqsj = getHtml(urljh)
    for d, html in enumerate(hqsj):
        # Thumbnail URLs are embedded as thumbURL":"..." in the page JSON.
        # Iterate over what was actually found (may be fewer than n): the
        # original indexed r[i] for i in range(n) and crashed with
        # IndexError on sparse result pages.
        thumbs = re.findall('thumbURL":"(.*?)"', html)[:n]
        for i, x in enumerate(thumbs):
            # Keep only .jpg/.jpeg/.png files.
            if not x.lower().endswith((".jpg", ".jpeg", ".png")):
                continue
            ext = x[x.rfind("."):]
            try:
                # timeout so one dead image host cannot hang the whole run.
                p = requests.get(x, stream=True, timeout=10)
                # "wb": binary write; `with` guarantees the handle is closed
                # even if the write fails (original leaked it).
                with open('C:\\Users\\明月心\\temp\\{0}{1}{2}'.format(gjc[d], i, ext), "wb") as f:
                    f.write(p.content)
            except Exception:
                pass  # best effort: skip images that fail to download/save


if __name__ == "__main__":
    so = ["佳人", "美景", "牛"]   # multiple search keywords
    getBaiduPictures(so, 10)      # first ten images per keyword