## 安装依赖
``` |
pip install -r requirements.txt |
``` |
## 配置 tutorial/middlewares.py |
``` |
# 代理服务器 (配置上对应信息) |
proxyHost = "t.16yun.cn" |
proxyPort = "31111" |
# 代理隧道验证信息 |
proxyUser = "username" |
proxyPass = "password" |
``` |
``` |
./run.sh |
``` |
attrs==19.3.0 |
Automat==20.2.0 |
cffi==1.14.0 |
constantly==15.1.0 |
cryptography==2.9.2 |
cssselect==1.1.0 |
hyperlink==19.0.0 |
idna==2.9 |
incremental==17.5.0 |
lxml==4.5.0 |
parsel==1.5.2 |
Protego==0.1.16 |
pyasn1==0.4.8 |
pyasn1-modules==0.2.8 |
pycparser==2.20 |
PyDispatcher==2.0.5 |
PyHamcrest==2.0.2 |
pyOpenSSL==19.1.0 |
queuelib==1.5.0 |
Scrapy==2.1.0 |
scrapy-random-useragent==0.2 |
service-identity==18.1.0 |
six==1.14.0 |
Twisted==20.3.0 |
w3lib==1.21.0 |
zope.interface==5.1.0 |
# Automatically created by: scrapy startproject |
# For more information about the [deploy] section see: |
# https://scrapyd.readthedocs.io/en/latest/deploy.html |
[settings] |
default = tutorial.settings |
[deploy] |
#url = http://localhost:6800/ |
project = tutorial |
# -*- coding: utf-8 -*- |
# Define here the models for your scraped items |
import scrapy |
class TutorialItem(scrapy.Item):
    """Container for scraped data; no fields are declared yet.

    Declare fields as class attributes, for example::

        name = scrapy.Field()
    """
# -*- coding: utf-8 -*- |
# Define here the models for your spider middleware |
from scrapy import signals |
# ! -*- encoding:utf-8 -*- |
import base64 |
import sys |
import random |
# True when running under Python 3, where text (str) and bytes are distinct.
PY3 = sys.version_info[0] >= 3


def base64ify(bytes_or_str):
    """Return the standard Base64 encoding of ``bytes_or_str``.

    The result is used to build an HTTP Basic ``Proxy-Authorization`` value,
    which requires the standard Base64 alphabet ("+" and "/"). The URL-safe
    alphabet ("-" and "_") would corrupt credentials whose encoding contains
    those characters, so ``base64.b64encode`` is used here.

    Args:
        bytes_or_str: Text or bytes to encode. Under Python 3, text is
            encoded as UTF-8 before being Base64-encoded.

    Returns:
        The encoded value: an ASCII ``str`` under Python 3, otherwise the
        encoder's raw output.
    """
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.b64encode(input_bytes)

    # Hand back a native string so callers can concatenate it with 'Basic '.
    return output_bytes.decode('ascii') if PY3 else output_bytes
class ProxyMiddleware(object):
    """Middleware that sends each request through the tunnel proxy.

    The proxy endpoint and credentials are placeholders that must be
    replaced with the real account details.
    """

    def process_request(self, request, spider):
        """Attach the proxy address, credentials and tunnel id to ``request``.

        Args:
            request: The outgoing request; its ``meta`` and ``headers`` are
                modified in place.
            spider: The spider issuing the request (unused).

        Returns:
            None.
        """
        # Proxy server endpoint.
        host, port = "n10.t.16yun.cn", "31111"
        # Proxy tunnel credentials.
        user, password = "username", "password"

        request.meta['proxy'] = "http://{0}:{1}".format(host, port)

        # Add the authentication header.
        credentials = base64ify(user + ":" + password)
        request.headers['Proxy-Authorization'] = 'Basic ' + credentials

        # IP-switching header (optional, adjust as needed): a fresh random
        # tunnel number is drawn for every request.
        request.headers['Proxy-Tunnel'] = str(random.randint(1, 10000))
class TutorialSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not every hook has to be defined; when one is missing, Scrapy behaves
    as if the middleware leaves the passed objects untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Should return None or raise an exception; this one always
        returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must produce an iterable of Request, dict
        or Item objects.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or ``process_spider_input``.

        Should return None or an iterable of Response, dict or Item
        objects; this one returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Works like ``process_spider_output`` but has no associated
        response. Must produce only requests, not items.
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class TutorialDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Not every hook has to be defined; when one is missing, Scrapy behaves
    as if the middleware leaves the passed objects untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Hook for each request passing through the downloader middleware.

        Must either return None (continue processing the request), return
        a Response, return a Request, or raise IgnoreRequest (in which case
        the process_exception() methods of installed downloader middleware
        are called). This one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged.

        Called with the response returned from the downloader. Must return
        a Response, return a Request, or raise IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions from a download handler or ``process_request``.

        Must either return None (continue processing the exception), or
        return a Response or Request (either stops the process_exception()
        chain). This one returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*- |
# Define your item pipelines here |
# |
# Don't forget to add your pipeline to the ITEM_PIPELINES setting |
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html |
class TutorialPipeline(object): |
def process_item(self, item, spider): |
return item |
# -*- coding: utf-8 -*- |
# Scrapy settings for tutorial project |
# Project (bot) name.
BOT_NAME = 'tutorial'

# Where spiders live and where newly generated spiders are placed.
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'

# BaseDupeFilter performs no duplicate filtering, so the demo spider's
# repeated URL is requested every time instead of being dropped.
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

# Downloader middlewares and their order values: the tunnel-proxy middleware
# and the random User-Agent middleware from scrapy-random-useragent.
DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.ProxyMiddleware': 100,
    'random_useragent.RandomUserAgentMiddleware': 400,
}

# File with one User-Agent string per line, read by RandomUserAgentMiddleware.
USER_AGENT_LIST = "useragents.txt"
import scrapy |
class DemoSpider(scrapy.Spider):
    """Demo spider that requests a few httpbin.org endpoints and logs the bodies."""

    name = "demo"

    def start_requests(self):
        """Yield one request per target URL, handled by ``parse``.

        The ``/ip`` endpoint is listed twice on purpose, so it is
        requested twice.
        """
        targets = (
            'http://httpbin.org/ip',
            'http://httpbin.org/ip',
            'http://httpbin.org/headers',
        )
        for target in targets:
            yield scrapy.Request(url=target, callback=self.parse)

    def parse(self, response):
        """Log the response body."""
        body_text = '{}'.format(response.body)
        self.log(body_text)
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv: Gecko/20110319 Firefox/40 |
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; QQBrowser/8.3.4769.400) |
MozillaTest/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/; ) |
Mozilla/5.0 (Windows NT 6.2; rv:39.0) Gecko/20100101 Firefox/39.0 |
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; QQBrowser/8.3.4769.400) |
Mozilla/5.0 (Windows NT 6.1; rv:39.0) Gecko/20100101 Firefox/39.0 |
Reference in new issue