2月27日最勤奋学员榜单

周文扬 发表于2021年02月27日
"""Print today's "most diligent" leaderboard for an OpenJudge contest.

A Chromium window is opened with pyppeteer so the user can log in to
OpenJudge by hand; the contest's submission-status pages are then fetched
with requests, and every user with enough submissions today is printed.

Image attached to the original post:
https://nos.netease.com/edu-image/4c3b9c0db0854f6ab5ca56e3e10fc342.jpg
"""
import asyncio
import datetime
import re
from collections import Counter

import bs4  # not used below; kept because the original script imports it
import pyppeteer as pyp
import requests

rq = datetime.date.today()  # date shown in the leaderboard banner
ylb = []  # not used by this script; kept so the module-level name still exists

# Submission-status listing of the contest; the page number is appended.
STATUS_URL = "https://cxsjsxmooc.openjudge.cn/2021pyspring/status/?page="
# Text the status page shows in the time column for yesterday's submissions.
YESTERDAY_MARKER = "昨天</abbr>"
# Captures the user name from a user link; non-greedy so that a line holding
# several links cannot be swallowed into a single match.
USERNAME_RE = re.compile(r'group-216/">(.+?)</a>')
MIN_SUBMISSIONS = 5    # users below this count are left off the leaderboard
MAX_PAGES = 1000       # hard stop so the paging loop can never run forever
REQUEST_TIMEOUT = 30   # seconds; requests waits indefinitely without one


def sessionGetHtml(session, url):
    """Fetch *url* through *session* and return the page text.

    Best effort: a failed request is printed and "" is returned.
    """
    # Request header that makes the client look like a desktop browser.
    fakeHeaders = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.77'
    }
    try:
        result = session.get(url, headers=fakeHeaders,
                             timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return ""
    result.encoding = result.apparent_encoding
    return result.text


async def makeSession(page):
    """Return a requests.Session holding the cookies of pyppeteer *page*."""
    cookies = await page.cookies()  # a list with one dict per cookie
    session = requests.Session()
    # requests only needs each cookie's name and value.
    session.cookies.update({c['name']: c['value'] for c in cookies})
    return session


async def antiAntiCrawler(page):
    """Apply anti-bot-detection tweaks to pyppeteer *page*."""
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36'
    )
    # Make navigator.webdriver report false on every new document.
    await page.evaluateOnNewDocument(
        '() =>{ Object.defineProperties(navigator,'
        '{ webdriver:{ get: () => false } }) }'
    )


def repc(u):
    """Return the text of URL *u*, fetched with browser-like headers.

    The request carries no cookies. Network errors propagate to the caller.
    """
    he = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        # "signed-exchange" was misspelled "siqned-exchange" originally.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }
    return requests.get(u, headers=he, timeout=REQUEST_TIMEOUT).text


def _todays_usernames(html):
    """Return ``(names, reached_yesterday)`` for one status page.

    *names* holds the user name of every row that precedes the first row
    carrying the "yesterday" marker; *reached_yesterday* tells whether such
    a row was found.
    """
    # Assumes every submission is its own <tr> row - TODO confirm against the
    # live page. If the page has no <tr> at all, the whole page is one chunk
    # and this degrades to the original "skip the page if it mentions
    # yesterday" behaviour.
    names = []
    for row in html.split("<tr"):
        if YESTERDAY_MARKER in row:
            return names, True
        names.extend(USERNAME_RE.findall(row))
    return names, False


def _collect_todays_usernames(fetch=repc):
    """Walk the status pages from page 1 and return one name per submission.

    Paging stops at the first row from yesterday, at a page without any
    user link (taken to mean the listing has ended), or after MAX_PAGES.
    """
    names = []
    for page_no in range(1, MAX_PAGES + 1):
        page_names, reached_yesterday = _todays_usernames(
            fetch(STATUS_URL + str(page_no))
        )
        names.extend(page_names)
        if reached_yesterday or not page_names:
            break
    return names


def _print_ranking(names):
    """Print users with at least MIN_SUBMISSIONS entries, busiest first."""
    print("-" * 10 + str(rq) + "最勤奋奖" + "-" * 10)
    for user, count in Counter(names).most_common():
        if count < MIN_SUBMISSIONS:
            break  # most_common() is sorted by descending count
        print(f"{user}   {count}")


async def getOjSourceCode(loginUrl):
    """Open *loginUrl* for a manual login, then print today's leaderboard.

    Raises whatever pyppeteer raises when the post-login heading does not
    appear within 30 seconds, and whatever requests raises on network
    failure. The browser is closed in either case.
    """
    width, height = 1400, 800  # browser window / viewport size
    browser = await pyp.launch(
        headless=False,
        # The option is spelled userDataDir; the original "userdataDir" is
        # not a pyppeteer option name, so the profile path was never applied.
        userDataDir="c:/tmp",
        args=[f'--window-size={width},{height}'],
    )
    try:
        page = await browser.newPage()
        await antiAntiCrawler(page)
        await page.setViewport({'width': width, 'height': height})
        await page.goto(loginUrl)
        # After the manual login, wait for the "ongoing contests" heading.
        await page.waitForSelector("#main>h2", timeout=30000)
        # NOTE(review): the status pages are fetched by repc() without the
        # browser's cookies, and makeSession()/sessionGetHtml() are never
        # called. Confirm whether these pages need the logged-in session.
        _print_ranking(_collect_todays_usernames())
    finally:
        await browser.close()


def main():
    """Run the scraper against the OpenJudge login page."""
    url = "https://openjudge.cn/auth/login/"
    # asyncio.get_event_loop() without a running loop is deprecated, so the
    # loop is created and installed explicitly.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(getOjSourceCode(url))


if __name__ == "__main__":
    main()