
init

master
jyhl 4 years ago
commit 51931f9cd9
  1. requirements.dev.txt (+3 lines)
  2. requirements.txt (+32 lines)
  3. scrapy.cfg (+11 lines)
  4. scrapypyppeteer/__init__.py (+0 lines)
  5. scrapypyppeteer/items.py (+14 lines)
  6. scrapypyppeteer/middlewares.py (+161 lines)
  7. scrapypyppeteer/pipelines.py (+11 lines)
  8. scrapypyppeteer/settings.py (+98 lines)
  9. scrapypyppeteer/spiders/16yun.py (+15 lines)
  10. scrapypyppeteer/spiders/__init__.py (+4 lines)
  11. scrapypyppeteer/spiders/__pycache__/16yun.cpython-36.pyc (binary)
  12. scrapypyppeteer/spiders/__pycache__/__init__.cpython-36.pyc (binary)
  13. scrapypyppeteer/spiders/__pycache__/quotes.cpython-36.pyc (binary)
  14. scrapypyppeteer/spiders/__pycache__/taobao.cpython-36.pyc (binary)
  15. scrapypyppeteer/spiders/quotes.py (+18 lines)
  16. scrapypyppeteer/spiders/taobao.py (+18 lines)
  17. useragents.txt (+6 lines)

3
requirements.dev.txt

@@ -0,0 +1,3 @@
pyppeteer
scrapy
scrapy-random-useragent

32
requirements.txt

@@ -0,0 +1,32 @@
appdirs==1.4.4
attrs==19.3.0
Automat==20.2.0
cffi==1.14.0
constantly==15.1.0
cryptography==2.9.2
cssselect==1.1.0
hyperlink==19.0.0
idna==2.9
incremental==17.5.0
lxml==4.5.0
parsel==1.6.0
Protego==0.1.16
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
PyDispatcher==2.0.5
pyee==7.0.1
PyHamcrest==2.0.2
pyOpenSSL==19.1.0
pyppeteer==0.2.2
queuelib==1.5.0
Scrapy==2.1.0
scrapy-random-useragent==0.2
service-identity==18.1.0
six==1.14.0
tqdm==4.46.0
Twisted==20.3.0
urllib3==1.25.9
w3lib==1.21.0
websockets==8.1
zope.interface==5.1.0

11
scrapy.cfg

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = scrapypyppeteer.settings
[deploy]
#url = http://localhost:6800/
project = scrapypyppeteer

0
scrapypyppeteer/__init__.py

14
scrapypyppeteer/items.py

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapypyppeteerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

161
scrapypyppeteer/middlewares.py

@@ -0,0 +1,161 @@
import websockets
from scrapy.http import HtmlResponse
from logging import getLogger
import asyncio
import pyppeteer
import logging
from concurrent.futures._base import TimeoutError
import base64
import sys
import random

pyppeteer_level = logging.WARNING
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)

PY3 = sys.version_info[0] >= 3


def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str
    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes
class ProxyMiddleware(object):
    # splitlines() avoids keeping a trailing newline in the User-Agent header value
    USER_AGENT = open('useragents.txt').read().splitlines()

    def process_request(self, request, spider):
        # proxy server
        # proxyHost = "t.16yun.cn"
        # proxyPort = "31111"
        proxyHost = "u1.5.tn.16yun.cn"
        proxyPort = "6441"
        # proxy tunnel authentication credentials
        # proxyUser = "username"
        # proxyPass = "password"
        proxyUser = "16ZJZYVL"
        proxyPass = "113813"
        request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)
        # add the authentication header
        encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        # set the IP-rotation header (optional, per request)
        tunnel = random.randint(1, 10000)
        request.headers['Proxy-Tunnel'] = str(tunnel)
        request.headers['User-Agent'] = random.choice(self.USER_AGENT)
class PyppeteerMiddleware(object):
    def __init__(self, **args):
        """
        init logger, loop, browser
        :param args:
        """
        self.logger = getLogger(__name__)
        self.loop = asyncio.get_event_loop()
        self.browser = self.loop.run_until_complete(
            pyppeteer.launch(headless=True))
        self.args = args

    def __del__(self):
        """
        close loop
        :return:
        """
        self.loop.close()

    def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
               timeout=8.0, keep_page=False):
        """
        render page with pyppeteer
        :param url: page url
        :param retries: max retry times
        :param script: js script to evaluate
        :param wait: number of seconds to wait before loading the page, preventing timeouts
        :param scrolldown: how many times to page down
        :param sleep: how long to sleep after the initial render
        :param timeout: the longest wait time, otherwise raise a timeout error
        :param keep_page: keep the page open instead of closing it
        :return: content, result, status
        """
        # define async render
        async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
            page = None
            try:
                # basic render
                page = await self.browser.newPage()
                await asyncio.sleep(wait)
                response = await page.goto(url, options={'timeout': int(timeout * 1000)})
                if response.status != 200:
                    return None, None, response.status
                result = None
                # evaluate with script
                if script:
                    result = await page.evaluate(script)
                # scroll down for {scrolldown} times
                if scrolldown:
                    for _ in range(scrolldown):
                        await page._keyboard.down('PageDown')
                        await asyncio.sleep(sleep)
                else:
                    await asyncio.sleep(sleep)
                if scrolldown:
                    await page._keyboard.up('PageDown')
                # get html of page
                content = await page.content()
                return content, result, response.status
            except TimeoutError:
                return None, None, 500
            finally:
                # if keep page, do not close it
                if not keep_page and page is not None:
                    await page.close()

        content, result, status = [None] * 3
        # retry for {retries} times
        for i in range(retries):
            if not content:
                content, result, status = self.loop.run_until_complete(
                    async_render(url=url, script=script, sleep=sleep, wait=wait,
                                 scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
            else:
                break
        # return the html, the js evaluation result and the http status
        return content, result, status

    def process_request(self, request, spider):
        """
        :param request: request object
        :param spider: spider object
        :return: HtmlResponse
        """
        # only render requests that explicitly ask for it via meta
        if request.meta.get('render'):
            try:
                self.logger.debug('rendering %s', request.url)
                html, result, status = self.render(request.url)
                return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                                    status=status)
            except websockets.exceptions.ConnectionClosed:
                pass

    @classmethod
    def from_crawler(cls, crawler):
        # PYPPETEER_ARGS from settings.py is passed in as keyword arguments
        return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))
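
A note on how the two middlewares interact: PyppeteerMiddleware only renders requests that carry a render flag in meta (none of the spiders in this commit set it), and render() navigates with the shared headless browser directly, so a rendered request does not go through the 16yun proxy or the random User-Agent that ProxyMiddleware attaches to the Scrapy request. A minimal sketch of a request that opts into rendering; the spider name is hypothetical and not part of this commit:

# sketch: opting a request into pyppeteer rendering (assumes the middlewares above are enabled)
from scrapy import Spider, Request

class RenderedQuotesSpider(Spider):
    name = 'rendered-quotes'  # hypothetical name, not in this commit

    def start_requests(self):
        # meta={'render': True} is the flag PyppeteerMiddleware.process_request checks
        yield Request('http://quotes.toscrape.com/js/',
                      meta={'render': True}, callback=self.parse)

    def parse(self, response):
        # response is the HtmlResponse built from the rendered page content
        self.logger.info('rendered html: %d bytes', len(response.body))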

11
scrapypyppeteer/pipelines.py

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapypyppeteerPipeline(object):
    def process_item(self, item, spider):
        return item

98
scrapypyppeteer/settings.py

@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
# Scrapy settings for scrapypyppeteer project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'scrapypyppeteer'
SPIDER_MODULES = ['scrapypyppeteer.spiders']
NEWSPIDER_MODULE = 'scrapypyppeteer.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'scrapypyppeteer (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'scrapypyppeteer.middlewares.ScrapypyppeteerSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapypyppeteer.middlewares.PyppeteerMiddleware': 543,
    'scrapypyppeteer.middlewares.ProxyMiddleware': 100,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'scrapypyppeteer.pipelines.ScrapypyppeteerPipeline': 300,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
PYPPETEER_ARGS = {
    'timeout': 8
}
LOG_LEVEL = 'INFO'
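
With ProxyMiddleware at priority 100 and PyppeteerMiddleware at 543, the proxy and header changes are applied before the rendering middleware sees the request. PYPPETEER_ARGS is handed to PyppeteerMiddleware.from_crawler as keyword arguments and stored in self.args, but render() falls back to its own timeout=8.0 default, so the configured value only takes effect if it is forwarded explicitly. An illustrative tweak, not part of this commit, inside PyppeteerMiddleware.process_request:

# illustrative only: forward the timeout configured in PYPPETEER_ARGS into render()
html, result, status = self.render(
    request.url,
    timeout=self.args.get('timeout', 8.0),  # 'timeout' comes from settings.py
)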

15
scrapypyppeteer/spiders/16yun.py

@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
from scrapy import Spider, Request


class YiniuYunSpider(Spider):
    name = '16yun'
    allowed_domains = ['current-ip.16yun.cn']
    start_url = 'http://current-ip.16yun.cn:802/ip'

    def start_requests(self):
        yield Request(self.start_url, callback=self.parse_list)

    def parse_list(self, response):
        with open('16yun.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

4
scrapypyppeteer/spiders/__init__.py

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

BIN
scrapypyppeteer/spiders/__pycache__/16yun.cpython-36.pyc

Binary file not shown.

BIN
scrapypyppeteer/spiders/__pycache__/__init__.cpython-36.pyc

Binary file not shown.

BIN
scrapypyppeteer/spiders/__pycache__/quotes.cpython-36.pyc

Binary file not shown.

BIN
scrapypyppeteer/spiders/__pycache__/taobao.cpython-36.pyc

Binary file not shown.

18
scrapypyppeteer/spiders/quotes.py

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/js/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags > a.tag::text').extract()
            }
        with open('quotes.js.enable.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
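
To run this spider against the project settings without the scrapy CLI, a minimal runner script also works; the file name run.py is an assumption and not part of this commit:

# run.py - sketch of a programmatic run that picks up scrapypyppeteer/settings.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # resolves scrapypyppeteer.settings via scrapy.cfg
process.crawl('quotes')   # spider name defined in quotes.py
process.start()           # blocks until the crawl finishes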

18
scrapypyppeteer/spiders/taobao.py

@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from scrapy import Spider, Request


class TaobaoSpider(Spider):
    name = 'taobao'
    allowed_domains = ['s.taobao.com']
    start_url = 'http://s.taobao.com/search?q={keyword}'
    keywords = ['ipad']

    def start_requests(self):
        for keyword in self.keywords:
            url = self.start_url.format(keyword=keyword)
            yield Request(url, callback=self.parse_list)

    def parse_list(self, response):
        with open('taobao.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

6
useragents.txt

@@ -0,0 +1,6 @@
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.16) Gecko/20110319 Firefox/40
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; QQBrowser/8.3.4769.400)
MozillaTest/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/; )
Mozilla/5.0 (Windows NT 6.2; rv:39.0) Gecko/20100101 Firefox/39.0
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; QQBrowser/8.3.4769.400)
Mozilla/5.0 (Windows NT 6.1; rv:39.0) Gecko/20100101 Firefox/39.0