2月27日最勤奋学员榜单

周文扬发表于2021年02月27日

<img src="https://nos.netease.com/edu-image/4c3b9c0db0854f6ab5ca56e3e10fc342.jpg" /> <code class="brush:python;toolbar:false" >#用pyppeteer库模拟登陆OJ网站后，结合requests库进行快速获取数据的爬虫程序。 import asyncio  import pyppeteer as pyp  import bs4  import requests  import re import datetime rq=datetime.date.today() def sessionGetHtml(session,url): #发送带session的网页请求  fakeHeaders = { 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \ AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Edg/81.0.416.77' }  # 伪装浏览器用的请求头  try:  result = session.get(url,headers = fakeHeaders)  result.encoding = result.apparent_encoding  return result.text  except Exception as e:  print(e)  return ""  async def makeSession(page): # 返回一个session,将其内部cookies修改成pypeteer浏览器页面对象中的cookies  cookies = await page.cookies() #cookies是一个列表，每个元素都是一个字典  cookies1 = {}  for cookie in cookies:  # requests中的cookies只要 "name"属性  cookies1[cookie['name']] = cookie['value']  session = requests.Session()  session.cookies.update(cookies1)  return session  async def antiAntiCrawler(page): #为page添加反反爬虫手段  await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64) \ ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/78.0.3904.70 Safari/537.36')  await page.evaluateOnNewDocument( '() =>{ Object.defineProperties(navigator,' '{ webdriver:{ get: () => false } }) }') def repc(u): he={ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0;win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36', 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/siqned-exchange;v=b3;q=0.9' } return requests.get(u,headers=he).text ylb=[] async def getOjSourceCode(loginUrl):  global ylb width, height = 1400, 800  #网页宽高  browser = await pyp.launch(headless=False, userdataDir = "c:/tmp", args=[f'--window-size={width},{height}'])  page = await browser.newPage()  await antiAntiCrawler(page)  await page.setViewport({'width': width, 'height': height})  await page.goto(loginUrl)  await page.waitForSelector("#main>h2", timeout=30000) #等待手动登录后，“正在进行的比赛...."标题出现  zsj=[] js=1 while 1: u="https://cxsjsxmooc.openjudge.cn/2021pyspring/status/?page="+str(js) html=repc(u) zt="昨天</abbr>"  #只统计今天的数据，当网页源码中出现"昨天"时中止循环。 if len(re.findall(zt,html))>0: break xz='group-216/">(.+)</a>' #寻找用户名的正则表达式 n=re.findall(xz,html) zsj += n js += 1 #对用户当天做题量进行统计排序 zd={} for i in set(zsj): zd[i] =0 for i in zsj: zd[i] = zd.get(i)+1 zd=sorted(zd.items(), key=lambda item:item[1], reverse=True)   print("-"*10+str(rq)+"最勤奋奖"+"-"*10) [print(f"{i[0]}   {i[1]}") for i in zd if i[1]>=5] await browser.close()  def main():  url = "https://openjudge.cn/auth/login/"  asyncio.get_event_loop().run_until_complete(getOjSourceCode(url))  main()</code>

发表回复

2月27日最勤奋学员榜单

友情链接

关注我们

关于我们