import websockets
from scrapy.http import HtmlResponse
from logging import getLogger
import asyncio
import pyppeteer
from pyppeteer.errors import TimeoutError as PyppeteerTimeoutError  # raised by page.goto on navigation timeout
import logging
from concurrent.futures import TimeoutError
import base64
import sys
import random

# Quiet the very verbose DEBUG output from pyppeteer and its websockets transport
pyppeteer_level = logging.WARNING
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)

PY3 = sys.version_info[0] >= 3


def base64ify(bytes_or_str):
    """Return a urlsafe base64 string for either str or bytes input."""
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

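# For illustration: base64ify("user:pass") evaluates to 'dXNlcjpwYXNz', the value that
# follows the 'Basic ' prefix in the Proxy-Authorization header built below.
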
class ProxyMiddleware(object):
    # User-Agent pool, one entry per line; strip newlines so they do not leak into the header
    USER_AGENT = [line.strip() for line in open('useragents.txt') if line.strip()]

    def process_request(self, request, spider):
        # Proxy server
        # proxyHost = "t.16yun.cn"
        # proxyPort = "31111"
        proxyHost = "u1.5.tn.16yun.cn"
        proxyPort = "6441"
        # Proxy tunnel authentication
        # proxyUser = "username"
        # proxyPass = "password"
        proxyUser = "16ZJZYVL"
        proxyPass = "113813"

        request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

        # Add the authentication header
        encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

        # Set the IP-rotation header (adjust to your needs)
        tunnel = random.randint(1, 10000)
        request.headers['Proxy-Tunnel'] = str(tunnel)
        request.headers['User-Agent'] = random.choice(self.USER_AGENT)

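# Both middlewares are enabled through Scrapy's DOWNLOADER_MIDDLEWARES setting. A minimal
# sketch for settings.py; the module path 'myproject.middlewares' and the priority numbers
# are illustrative assumptions, only the class names come from this file:
#
# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.ProxyMiddleware': 543,
#     'myproject.middlewares.PyppeteerMiddleware': 600,
# }
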
class PyppeteerMiddleware(object):
    def __init__(self, **args):
        """
        Init logger, event loop and headless browser.
        :param args: keyword arguments forwarded from the PYPPETEER_ARGS setting (see from_crawler)
        """
        self.logger = getLogger(__name__)
        self.loop = asyncio.get_event_loop()
        self.browser = self.loop.run_until_complete(
            pyppeteer.launch(headless=True))
        self.args = args

    def __del__(self):
        """
        Close the event loop when the middleware is garbage-collected.
        """
        self.loop.close()

    def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
               timeout=8.0, keep_page=False):
        """
        Render a page with pyppeteer.
        :param url: page url
        :param retries: max retry times
        :param script: js script to evaluate on the rendered page
        :param wait: number of seconds to wait before loading the page, preventing timeouts
        :param scrolldown: how many times to page down
        :param sleep: how long to sleep after the initial render and between scrolls
        :param timeout: the longest wait time, otherwise raise a timeout error
        :param keep_page: keep the page open instead of closing it after rendering
        :return: content, result, status
        """

        # define async render
        async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
            page = None
            try:
                # basic render
                page = await self.browser.newPage()
                await asyncio.sleep(wait)
                response = await page.goto(url, options={'timeout': int(timeout * 1000)})
                if response.status != 200:
                    return None, None, response.status
                result = None
                # evaluate with script
                if script:
                    result = await page.evaluate(script)

                # scroll down for {scrolldown} times
                if scrolldown:
                    for _ in range(scrolldown):
                        await page._keyboard.down('PageDown')
                        await asyncio.sleep(sleep)
                else:
                    await asyncio.sleep(sleep)
                if scrolldown:
                    await page._keyboard.up('PageDown')

                # get html of page
                content = await page.content()

                return content, result, response.status
            except (TimeoutError, PyppeteerTimeoutError):
                # navigation or evaluation exceeded `timeout`; report it as a 500
                return None, None, 500
            finally:
                # close the page unless the caller asked to keep it open
                if page and not keep_page:
                    await page.close()

        content, result, status = [None] * 3

        # retry for {retries} times
        for _ in range(retries):
            if not content:
                content, result, status = self.loop.run_until_complete(
                    async_render(url=url, script=script, sleep=sleep, wait=wait,
                                 scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
            else:
                break

        # return the html content, the js evaluation result and the HTTP status
        return content, result, status

    def process_request(self, request, spider):
        """
        Render the request with pyppeteer when request.meta['render'] is set.
        :param request: request object
        :param spider: spider object
        :return: HtmlResponse, or None to fall back to the default downloader
        """
        if request.meta.get('render'):
            try:
                self.logger.debug('rendering %s', request.url)
                html, result, status = self.render(request.url)
                return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                                    status=status)
            except websockets.exceptions.ConnectionClosed:
                # browser connection dropped; fall through so Scrapy downloads the request normally
                pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))
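

# Usage sketch (illustrative; only the 'render' meta key and the PYPPETEER_ARGS setting are
# read by this middleware, everything else below is an assumption about the host project):
#
#   # settings.py
#   PYPPETEER_ARGS = {}  # forwarded to PyppeteerMiddleware.__init__ via from_crawler
#
#   # inside a spider callback
#   yield scrapy.Request(url, callback=self.parse, meta={'render': True})
#
# Requests without meta['render'] pass through untouched and are fetched by Scrapy's default
# downloader; ProxyMiddleware still attaches its proxy and a random User-Agent to them.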