Scrapy-Socks5代理中间件

Scrapy-Socks5代理中间件

Demo

环境: Python(2.7+) + Scrapy(1.1.1) + Twisted(16.6.0)
官网没有直接提供 SOCKS 代理中间件，所以自己写一个代理下载处理器。
需要依赖库 txsocksx 。
pip install txsocksx

midddlewares.py

# -*- coding: utf-8 -*-
# 需要依赖 txsocksx
# pip install txsocksx


from txsocksx.http import SOCKS5Agent
from twisted.internet import reactor
from scrapy.xlib.tx import TCP4ClientEndpoint
from scrapy.core.downloader.webclient import _parse
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, ScrapyAgent

# SOCKS5 proxy connection settings.
# Replace the placeholders below with the real proxy host/port and the
# credentials for the SOCKS5 'login' (username/password) auth method.
proxyHost = "xxxx.com"  # proxy server hostname (placeholder)
proxyPort = 9020  # proxy server TCP port
proxyUser = "1234"  # SOCKS5 username (placeholder)
proxyPass = "pass"  # SOCKS5 password (placeholder)

class Socks5DownloadHandler(HTTP11DownloadHandler):
    """HTTP(S) download handler that tunnels every request through SOCKS5."""

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        # Delegate the actual download to a SOCKS5-aware agent, reusing the
        # handler's TLS context factory and connection pool.
        socks_agent = ScrapySocks5Agent(
            contextFactory=self._contextFactory, pool=self._pool)
        return socks_agent.download_request(request)

class ScrapySocks5Agent(ScrapyAgent):
    """ScrapyAgent variant whose connections go through a SOCKS5 proxy.

    Credentials come from the module-level proxyHost/proxyPort/proxyUser/
    proxyPass constants; the per-request ``request.meta['proxy']`` value is
    ignored here (see the commented-out lines for a per-request variant).
    """

    def _get_agent(self, request, timeout):
        """Return a txsocksx SOCKS5Agent connecting via the configured proxy.

        Fixes vs. the original snippet:
        * ``endpointArgs`` is a keyword of ``SOCKS5Agent`` (forwarded to the
          SOCKS client factory), NOT of ``TCP4ClientEndpoint`` — passing it to
          the endpoint raises ``TypeError: unexpected keyword argument``.
        * txsocksx expects the ``'login'`` auth method value to be a
          ``(user, password)`` tuple; ``{proxyUser, proxyPass}`` is a *set*
          literal, which is unordered and could swap user and password.
        """
        # Per-request proxy selection could be restored like this:
        # proxy = request.meta.get('proxy')
        # if proxy:
        #     _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort)
        agent = SOCKS5Agent(
            reactor,
            proxyEndpoint=proxyEndpoint,
            endpointArgs=dict(methods={'login': (proxyUser, proxyPass)}),
        )
        return agent
# return self._Agent(reactor, contextFactory=self._contextFactory,
# connectTimeout=timeout, bindAddress=bindAddress, pool=self._pool)

settings.py

# Replace Scrapy's default http/https download handlers with the
# SOCKS5-capable one defined in myspider/midddlewares.py (note: the module
# name "midddlewares" is spelled with three d's in this project).
DOWNLOAD_HANDLERS = {
"http": "myspider.midddlewares.Socks5DownloadHandler",
"https": "myspider.midddlewares.Socks5DownloadHandler"

}

End