[英]Getting 504 Gateway Time-out while running SplashRequest through ScrapySpider
在 VM VirtualBox 上运行 ubuntu。
运行 ifconfig 命令:
>docker0: flags=4099<UP,BROADCAST,MULTICAST> mtu 1500
inet 172.17.0.1 netmask 255.255.0.0 broadcast 172.17.255.255
inet6 fe80::42:90ff:fe9b:4d22 prefixlen 64 scopeid 0x20<link>
ether 02:42:90:9b:4d:22 txqueuelen 0 (Ethernet)
RX packets 10757 bytes 5983236 (5.9 MB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 10909 bytes 15688953 (15.6 MB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
enp0s3: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 10.0.2.15 netmask 255.255.255.0 broadcast 10.0.2.255
inet6 fe80::6ec6:7ba3:79fa:8791 prefixlen 64 scopeid 0x20<link>
ether 08:00:27:92:a7:a7 txqueuelen 1000 (Ethernet)
RX packets 145146 bytes 145357306 (145.3 MB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 79019 bytes 19069408 (19.0 MB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
lo: flags=73<UP,LOOPBACK,RUNNING> mtu 65536
inet 127.0.0.1 netmask 255.0.0.0
inet6 ::1 prefixlen 128 scopeid 0x10<host>
loop txqueuelen 1000 (Local Loopback)
RX packets 8252 bytes 3265348 (3.2 MB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 8252 bytes 3265348 (3.2 MB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
Scrapy终端:
Retrying <GET http://www.gari.pk/used-cars-search/ via http://172.17.0.1:8050/execute> (failed 1 times): 504 Gateway Time-out
Splash 终端:
2019-07-24 08:14:07.645051 [-] 服务器监听http://0.0.0.0:8050 libpng 警告:iCCP:已知不正确的 sRGB 配置文件 libpng 警告:iCCP:已知不正确的 sRGB 配置文件进程 1:D-Bus 库似乎设置不正确; 无法读取机器 uuid:UUID 文件“/etc/machine-id”应包含长度为 32 的十六进制字符串,而不是长度为 0,没有其他文本请参阅 dbus-uuidgen 的手册页以更正此问题。 qt.network.ssl:QSslSocket:无法解析 SSLv2_client_method qt.network.ssl:QSslSocket:无法解析 SSLv2_server_method
2019 年 7 月 24 日 08:14:40.935910 [事件] {“_id”:140385374548096,“client_ip”:“172.17.0.1”,“maxrss”:145496,“rendertime”:30385374548096,“rendertime”:30385374548096,“rendertime”:30385374548096 , "status_code":504, "load":[ 0.23, 0.25, 0.26 ], "error":{ "type":"GlobalTimeoutError", "info":{ "timeout":30 }, "description":"超时超出渲染页面", "error":504 }, "method":"POST", "qsize":0, "args":{ "headers":{ "Accept-Language":"en", "User-Agent “:” Scrapy / 1.6.0(+ https://scrapy.org )”, “接受”: “text / html的,应用/ XHTML + xml的,应用/ XML; q = 0.9,/ q = 0.8”} , "url":" http://www.gari.pk/used-cars-search/ ", "cookies":[ ], "uid":140385374548096, "lua_source":"\\n 函数 find_search_input(inputs)\\ n if #inputs == 1 then\\n return inputs[1]\\n else\\n for _, input in ipairs(inputs) do\\n if input.node.attributes.type == \\"search\\" then\\n返回输入\\n end\\n end\\n end\\n end\\n\\n function find_input(forms)\\n local potential = {}\\n\\n for _, form in ipairs(forms) do\\n local inputs = form .node:querySelectorAll('input:not([type= \\"hidden\\"])')\\n if #inputs ~= 0 then\\n local input = find_search_input(inputs)\\n if input then\\n return form, input\\n end\\n\\n potential[#potential + 1] = {input=inputs[1], form=form}\\n end\\n end\\n\\n return potential[1].form, potential[1].input\\n end\\n\\n function main(splash) , args)\\n -- 找到一个表单并提交 \\"splash\\" 给它\\n local function search_for_splash()\\n local forms = splash:select_all('form')\\n\\n if #forms == 0 then \\n 错误('找不到搜索表单')\\n 结束\\n\\n 本地表单,输入 = find_input(forms)\\n\\n 如果没有输入则\\n 错误('找不到搜索表单')\\n end\\n\\n assert(input:send_keys('honda'))\\n assert(splash:wait(0))\\n assert(form:submit())\\n end\\n\\n -- 主渲染脚本\\ n assert(splash:go(splash.args.url))\\n assert(splash:wait(5))\\n search_for_splash()\\n assert(splash:wait(15))\\n --assert(splash:runjs) ('search_query('', (100));'))\\n local button = splash:select('a[href*=\\"search_query\\"]')\\n button.node:setAttribute('href', \\"javascript: search_query('', (20))\\");\\nb utton:mouse_click()\\n 
assert(splash:wait(120))\\n \\n return {html = splash:html()}\\n end\\n " }, "timestamp":1563956080, "fds":21, "active":0, "user-agent":"Scrapy/1.6.0 (+ https://scrapy.org )" } 2019-07-24 08:14:40.936842 [-] "172.17.0.1" - - [24/Jul/2019:08:14:40 +0000] "POST /execute HTTP/1.1" 504 119 "-" "Scrapy/1.6.0 (+ https://scrapy.org )"
试过这个: docker run -p 8050:8050 scrapinghub/splash --max-timeout 240
Spider 代码:
import scrapy
import re
from scrapy_splash import SplashRequest
class GarispiderSpider(scrapy.Spider):
    """Render the gari.pk used-car search page through Splash and print the HTML.

    The spider sends a single ``SplashRequest`` to the ``execute`` endpoint.
    The Lua script loads the start URL, types ``honda`` into the search form,
    submits it, rewrites the pagination link and clicks it, then returns the
    final page HTML.
    """

    name = 'gariSpider'
    allowed_domains = ['www.gari.pk']
    start_urls = ['http://www.gari.pk/used-cars-search/']

    # Splash aborts a script after 30 seconds unless the request carries a
    # larger ``timeout`` argument. The script below waits 5 + 15 + 120 seconds,
    # so without this value every request ends in "504 Gateway Time-out"
    # (GlobalTimeoutError). The Splash server must be started with
    # ``--max-timeout`` greater than or equal to this value, e.g.
    # ``docker run -p 8050:8050 scrapinghub/splash --max-timeout 240``.
    splash_timeout = 240

    lua_script = """
    local function find_search_input(inputs)
      if #inputs == 1 then
        return inputs[1]
      else
        for _, input in ipairs(inputs) do
          if input.node.attributes.type == "search" then
            return input
          end
        end
      end
    end

    local function find_input(forms)
      local potential = {}
      for _, form in ipairs(forms) do
        local inputs = form.node:querySelectorAll('input:not([type="hidden"])')
        if #inputs ~= 0 then
          local input = find_search_input(inputs)
          if input then
            return form, input
          end
          potential[#potential + 1] = {input=inputs[1], form=form}
        end
      end
      -- No form had a visible input: return nils so the caller can raise a
      -- readable error instead of failing on an index of a nil value.
      if #potential == 0 then
        return nil, nil
      end
      return potential[1].form, potential[1].input
    end

    function main(splash, args)
      -- find a form and submit the search term "honda" to it
      local function search_for_splash()
        local forms = splash:select_all('form')
        if #forms == 0 then
          error('no search form is found')
        end
        local form, input = find_input(forms)
        if not input then
          error('no search form is found')
        end
        assert(input:send_keys('honda'))
        assert(splash:wait(0))
        assert(form:submit())
      end

      -- main rendering script
      assert(splash:go(splash.args.url))
      assert(splash:wait(5))
      search_for_splash()
      assert(splash:wait(15))
      --assert(splash:runjs('search_query('', (100));'))
      local button = splash:select('a[href*="search_query"]')
      if not button then
        error('no search_query link is found')
      end
      button.node:setAttribute('href', "javascript: search_query('', (20))");
      assert(button:mouse_click())
      assert(splash:wait(120))
      return {html = splash:html()}
    end
    """

    def start_requests(self):
        """Yield one Splash ``execute`` request for the first start URL."""
        url = self.start_urls[0]
        yield SplashRequest(
            url,
            callback=self.parse,
            endpoint='execute',
            args={
                'lua_source': self.lua_script,
                # Raise Splash's per-request limit above the script's waits.
                'timeout': self.splash_timeout,
            },
        )

    def parse(self, response):
        """Print the raw body returned by Splash for the rendered page."""
        print(response.body)
###########################################################################
############################Setting.py#####################################
# --- Project identity --------------------------------------------------------
BOT_NAME = 'ScrappyApp'
NEWSPIDER_MODULE = 'ScrappyApp.spiders'
SPIDER_MODULES = ['ScrappyApp.spiders']

# --- Crawl policy ------------------------------------------------------------
ROBOTSTXT_OBEY = False

# --- Splash integration ------------------------------------------------------
# Address of the Splash HTTP API (the docker0 bridge address is used here).
# Alternative that was tried: 'http://10.0.2.15:8050'
SPLASH_URL = 'http://172.17.0.1:8050'

# Splash-aware duplicate filtering and HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# --- Item pipelines ----------------------------------------------------------
ITEM_PIPELINES = {
    'ScrappyApp.pipelines.ScrappyappPipeline': 300,
}
我期望得到 Lua 脚本执行之后的 HTML 正文,但实际得到的是:Retrying <GET http://www.gari.pk/used-cars-search/ via http://172.17.0.1:8050/execute> (failed 1 times): 504 Gateway Time-out
我手头没有现成的 Scrapy 项目,所以下面用 requests 库给出一个示例,
它可以很容易地改写成 Scrapy 请求:
import requests
# POST the search form directly to the site's AJAX endpoint.
r = requests.post(
    'http://www.gari.pk/search-car-ajax.php',
    data={'search_param': 'cars_mini/,/c_date desc/bmw'},
)
在 r.content 中,
您将得到包含所需数据的响应。对于分页,站点会发出相同的请求,只是额外附加一个偏移量;您只需要把这个偏移量加到请求数据中即可。下面是一个例子:
# Same request with a trailing "/10" appended to search_param as the offset.
r = requests.post(
    'http://www.gari.pk/search-car-ajax.php',
    data={'search_param': 'cars_mini/,/c_date desc/bmw/10'},
)
如您所见,这里在 search_param 的末尾添加了 /10 作为偏移量:{'search_param': 'cars_mini/,/c_date desc/bmw/10'}
也许您还可以让每个请求返回更多结果。我建议您在浏览器的开发者控制台(Developer console)-> Network -> XHR 中查看这些请求。在 Scrapy 中可以使用 FormRequest 发送同样的请求,文档见:https://doc.scrapy.org/en/latest/topics/request-response.html#scrapy.http.FormRequest
# Fragment meant to sit inside a spider method: POST the form data to the AJAX
# endpoint and hand the response to self.parse.
yield scrapy.FormRequest('http://www.gari.pk/search-car-ajax.php', callback=self.parse,method='POST', formdata={'search_param': 'cars_mini/,/c_date desc/bmw/10'})
这是因为您要抓取的 URL 返回了 transfer-encoding
标头。我已经在 GitHub 上就此问题提交了一个 issue。
下面的脚本可以证明这一点:请求的 URL(httpbin.org/headers)会原样返回我随请求发送的标头。
import requests
import json
ENDPOINT_SPLASH = 'http://localhost:8050/execute'
def test_with_custom_headers():
    """Render httpbin.org/headers through Splash with a harmless custom header.

    Posts a Lua script to ``ENDPOINT_SPLASH`` that sets ``x-custom-header``
    before loading the URL.

    Returns:
        The ``html`` field of Splash's JSON reply, or the whole decoded reply
        when it has no ``html`` key (for example a Splash error object).

    Raises:
        requests.exceptions.Timeout: if Splash does not answer within the
            client-side timeout.
    """
    lua_script = """
function main(splash, args)
splash:set_custom_headers({
["x-custom-header"] = "splash"
})
assert(splash:go(args.url))
assert(splash:wait(0.5))
return {
html = splash:html()
}
end
"""
    payload = {
        'lua_source': lua_script,
        'url': 'https://httpbin.org/headers',
        'timeout': 15,
    }
    # requests has no default timeout, so an unreachable or hung Splash would
    # block forever. Allow a little more than the Splash-side timeout so that
    # Splash's own timeout reply can still be received.
    r = requests.post(url=ENDPOINT_SPLASH,
                      json=payload,
                      timeout=payload['timeout'] + 5)
    result = json.loads(r.text)
    return result.get('html', result)
def test_with_content_encoding():
    """Render httpbin.org/headers through Splash with a transfer-encoding header.

    Posts a Lua script to ``ENDPOINT_SPLASH`` that sets
    ``transfer-encoding: chunked`` as a custom request header before loading
    the URL, to compare against ``test_with_custom_headers``.

    Returns:
        The ``html`` field of Splash's JSON reply, or the whole decoded reply
        when it has no ``html`` key (for example a Splash error object).

    Raises:
        requests.exceptions.Timeout: if Splash does not answer within the
            client-side timeout.
    """
    lua_script = """
function main(splash, args)
splash:set_custom_headers({
["transfer-encoding"] = "chunked"
})
assert(splash:go(args.url))
assert(splash:wait(0.5))
return {
html = splash:html()
}
end
"""
    payload = {
        'lua_source': lua_script,
        'url': 'https://httpbin.org/headers',
        'timeout': 15,
    }
    # requests has no default timeout, so an unreachable or hung Splash would
    # block forever. Allow a little more than the Splash-side timeout so that
    # Splash's own timeout reply can still be received.
    r = requests.post(url=ENDPOINT_SPLASH,
                      json=payload,
                      timeout=payload['timeout'] + 5)
    result = json.loads(r.text)
    return result.get('html', result)
# Run the two probes one after the other, printing each result before the next
# request is made.
custom_headers_result = test_with_custom_headers()
print("test_with_custom_headers: \n{}\n".format(custom_headers_result))

content_encoding_result = test_with_content_encoding()
print("test_with_content_encoding: \n{}".format(content_encoding_result))
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.