- 综合讨论区
- 帖子详情
2月27日最勤奋学员榜单
周文扬
发表于2021年02月27日
<p><img src="https://nos.netease.com/edu-image/4c3b9c0db0854f6ab5ca56e3e10fc342.jpg" /></p><p><br /></p><p><code class="brush:python;toolbar:false" >#用pyppeteer库模拟登录OJ网站后,结合requests库进行快速获取数据的爬虫程序。
import asyncio
import datetime
import re
from collections import Counter

import bs4
import pyppeteer as pyp
import requests
# Local date captured once when the module is loaded; it is printed in the
# leaderboard header.
rq = datetime.date.today()
def sessionGetHtml(session, url, timeout=10):
    """Fetch *url* through *session* and return the page text.

    Args:
        session: Object exposing a ``get(url, headers=..., timeout=...)``
            method, e.g. a ``requests.Session``.
        url: Address of the page to download.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        The response body decoded with the response's ``apparent_encoding``,
        or ``""`` if the request fails for any reason (the error is printed).
    """
    # Browser-like request header used to disguise the script as a browser.
    fakeHeaders = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.77'
        ),
    }
    try:
        result = session.get(url, headers=fakeHeaders, timeout=timeout)
        # Use the encoding detected from the body rather than the declared one.
        result.encoding = result.apparent_encoding
        return result.text
    except Exception as e:
        # Deliberately best-effort: report the problem and return an empty
        # page instead of propagating the error.
        print(e)
        return ""
async def makeSession(page):
    """Return a ``requests.Session`` carrying the cookies of a pyppeteer page.

    ``page.cookies()`` yields a list of dicts, one per cookie; only each
    cookie's ``name`` and ``value`` are copied into the session.
    """
    browserCookies = await page.cookies()
    session = requests.Session()
    session.cookies.update(
        {entry['name']: entry['value'] for entry in browserCookies}
    )
    return session
async def antiAntiCrawler(page):
    """Apply counter-measures against crawler detection to *page*.

    Sets a desktop Chrome user agent and registers, through
    ``page.evaluateOnNewDocument``, a script that redefines
    ``navigator.webdriver`` so that it returns ``false``.
    """
    userAgent = (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36'
    )
    hideWebdriverJs = (
        '() =>{ Object.defineProperties(navigator,'
        '{ webdriver:{ get: () => false } }) }'
    )
    await page.setUserAgent(userAgent)
    await page.evaluateOnNewDocument(hideWebdriverJs)
def repc(u, timeout=10):
    """Download *u* with browser-like headers and return the body as text.

    Args:
        u: URL to request.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled connection would
            hang the caller forever.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: Network errors and timeouts are not
            handled here and propagate to the caller.
    """
    he = {
        # Well-formed Chrome user agent ("Win64", space before AppleWebKit);
        # a malformed value defeats the purpose of posing as a browser.
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/87.0.4280.88 Safari/537.36'
        ),
        # "signed-exchange" was previously misspelled "siqned-exchange".
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/avif,image/webp,image/apng,*/*;q=0.8,'
            'application/signed-exchange;v=b3;q=0.9'
        ),
    }
    return requests.get(u, headers=he, timeout=timeout).text
# Module-level list; nothing in the visible code reads from or appends to it.
ylb = []
def _collectTodayNames():
    """Return one user name per submission made today, newest first.

    Walks the status pages starting at page 1 and stops at the first row
    whose timestamp is rendered as "昨天" (yesterday), or at a page that
    yields no user names at all (error page or past the last page), so the
    loop always terminates.
    """
    statusUrl = "https://cxsjsxmooc.openjudge.cn/2021pyspring/status/?page="
    yesterdayMarker = "昨天</abbr>"
    namePattern = re.compile('group-216/">(.+)</a>')  # captures the user name

    names = []
    pageNo = 1
    while True:
        # NOTE(review): repc() does not send the browser's login cookies;
        # makeSession/sessionGetHtml exist in this file for that purpose.
        # Confirm whether the status page needs authentication.
        html = repc(statusUrl + str(pageNo))
        pageNames = []
        reachedYesterday = False
        # Scan row by row so that today's rows appearing before the first
        # "yesterday" row on the same page are still counted. This assumes
        # each submission is rendered as one <tr> row; if the page has no
        # "<tr", the whole page is a single chunk and is discarded when it
        # contains the marker.
        for row in html.split("<tr"):
            if yesterdayMarker in row:
                reachedYesterday = True
                break
            pageNames.extend(namePattern.findall(row))
        names.extend(pageNames)
        if reachedYesterday or not pageNames:
            break
        pageNo += 1
    return names


def _printDiligenceBoard(names, minSubmissions=5):
    """Print users with at least *minSubmissions* entries, busiest first."""
    print("-"*10+str(rq)+"最勤奋奖"+"-"*10)
    for name, count in Counter(names).most_common():
        if count < minSubmissions:
            break  # most_common() is sorted by count, so the rest are lower
        print(f"{name} {count}")


async def getOjSourceCode(loginUrl):
    """Open a browser for manual login, then print today's submission ranking.

    A visible Chromium window is opened on *loginUrl* and the coroutine waits
    up to 30 seconds for the ``#main>h2`` heading that appears after the user
    has logged in by hand. It then downloads the course status pages, counts
    today's submissions per user and prints every user with five or more.

    Args:
        loginUrl: Address of the login page to open in the browser.

    Raises:
        pyppeteer.errors.TimeoutError: presumably raised by
            ``waitForSelector`` if the heading does not appear in time;
            verify against the installed pyppeteer version.
    """
    width, height = 1400, 800  # browser window and viewport size in pixels
    browser = await pyp.launch(
        headless=False,
        # pyppeteer's option is spelled "userDataDir"; the former
        # "userdataDir" spelling was silently ignored.
        userDataDir="c:/tmp",
        args=[f'--window-size={width},{height}'],
    )
    try:
        page = await browser.newPage()
        await antiAntiCrawler(page)
        await page.setViewport({'width': width, 'height': height})
        await page.goto(loginUrl)
        # Block until the post-login heading is present (manual login).
        await page.waitForSelector("#main>h2", timeout=30000)
        _printDiligenceBoard(_collectTodayNames())
    finally:
        # Close the browser even if login times out or scraping fails.
        await browser.close()
def main():
    """Run the scraper on the asyncio event loop, starting at the login page."""
    loginUrl = "https://openjudge.cn/auth/login/"
    loop = asyncio.get_event_loop()
    loop.run_until_complete(getOjSourceCode(loginUrl))
main()</code></p>