I am trying to get around Scrapy's requirement of feeding the spider URLs from the start_urls list or the start_requests generator. Instead, I would like to kick off the downloader (and the subsequent downloader middleware) from a callback that is triggered by RabbitMQ's Twisted protocol whenever it receives a message on a particular exchange.
Below is a simple example of a consumer and a producer app. The producer is a Python program that sends URLs to a RabbitMQ exchange. The consumer is a non-blocking Twisted protocol that simply prints them to stdout.
Here's the sample consumer Twisted protocol. It listens to the routing key urls on the exchange quotes:
import pika
from pika import spec
from pika.adapters import twisted_connection
from pika.exchange_type import ExchangeType
from twisted.internet import reactor, protocol, defer
from twisted.internet.defer import inlineCallbacks
from twisted.python import log
PREFETCH_COUNT = 2
EXCHANGE = "quotes"
ROUTING_KEY = "urls"
class PikaProtocol(twisted_connection.TwistedProtocolConnection):
connected = False
name = "AMQP:Protocol"
def __init__(self, factory, parameters):
super().__init__(parameters)
self.factory = factory
@inlineCallbacks
def connectionReady(self):
self._channel = yield self.channel()
yield self._channel.basic_qos(prefetch_count=PREFETCH_COUNT)
self.connected = True
yield self._channel.confirm_delivery()
for (
exchange,
routing_key,
callback,
) in self.factory.read_list:
yield self.setup_read(exchange, routing_key, callback)
self.send()
@inlineCallbacks
def read(self, exchange, routing_key, callback):
"""Add an exchange to the list of exchanges to read from."""
if self.connected:
yield self.setup_read(exchange, routing_key, callback)
@inlineCallbacks
def setup_read(self, exchange, routing_key, callback):
"""This function does the work to read from an exchange."""
if exchange:
yield self._channel.exchange_declare(
exchange=exchange,
exchange_type=ExchangeType.topic,
durable=True,
auto_delete=False,
)
yield self._channel.queue_declare(queue=routing_key, durable=True)
        if exchange:
            yield self._channel.queue_bind(
                queue=routing_key, exchange=exchange, routing_key=routing_key
            )
(
queue,
_consumer_tag,
) = yield self._channel.basic_consume(queue=routing_key, auto_ack=False)
d = queue.get()
d.addCallback(self._read_item, queue, callback)
d.addErrback(self._read_item_err)
def _read_item(self, item, queue, callback):
"""Callback function which is called when an item is read."""
d = queue.get()
d.addCallback(self._read_item, queue, callback)
d.addErrback(self._read_item_err)
(
channel,
deliver,
_props,
msg,
) = item
log.msg(
"%s (%s): %s" % (deliver.exchange, deliver.routing_key, repr(msg)),
system="Pika:<=",
)
d = defer.maybeDeferred(callback, item)
d.addCallbacks(
lambda _: channel.basic_ack(deliver.delivery_tag),
lambda _: channel.basic_nack(deliver.delivery_tag),
)
@staticmethod
def _read_item_err(error):
print(error)
def send(self):
"""If connected, send all waiting messages."""
if self.connected:
while self.factory.queued_messages:
(
exchange,
r_key,
message,
) = self.factory.queued_messages.pop(0)
self.send_message(exchange, r_key, message)
@inlineCallbacks
def send_message(self, exchange, routing_key, msg):
"""Send a single message."""
log.msg("%s (%s): %s" % (exchange, routing_key, repr(msg)), system="Pika:=>")
yield self._channel.exchange_declare(
exchange=exchange,
exchange_type=ExchangeType.topic,
durable=True,
auto_delete=False,
)
prop = spec.BasicProperties(delivery_mode=2)
try:
yield self._channel.basic_publish(
exchange=exchange, routing_key=routing_key, body=msg, properties=prop
)
except Exception as error: # pylint: disable=W0703
log.msg("Error while sending message: %s" % error, system=self.name)
class PikaFactory(protocol.ReconnectingClientFactory):
name = "AMQP:Factory"
def __init__(self, parameters):
self.parameters = parameters
self.client = None
self.queued_messages = []
self.read_list = []
def startedConnecting(self, connector):
log.msg("Started to connect.", system=self.name)
def buildProtocol(self, addr):
self.resetDelay()
log.msg("Connected", system=self.name)
self.client = PikaProtocol(self, self.parameters)
return self.client
def clientConnectionLost(self, connector, reason): # pylint: disable=W0221
log.msg("Lost connection. Reason: %s" % reason.value, system=self.name)
protocol.ReconnectingClientFactory.clientConnectionLost(self, connector, reason)
def clientConnectionFailed(self, connector, reason):
log.msg("Connection failed. Reason: %s" % reason.value, system=self.name)
protocol.ReconnectingClientFactory.clientConnectionFailed(
self, connector, reason
)
def send_message(self, exchange=None, routing_key=None, message=None):
self.queued_messages.append((exchange, routing_key, message))
if self.client is not None:
self.client.send()
def read_messages(self, exchange, routing_key, callback):
"""Configure an exchange to be read from."""
self.read_list.append((exchange, routing_key, callback))
if self.client is not None:
self.client.read(exchange, routing_key, callback)
def main():
parameters = pika.ConnectionParameters(
host="localhost",
virtual_host="/",
credentials=pika.PlainCredentials("guest", "guest"),
)
def callback(tup):
url = tup[-1].decode('utf-8')
print(f'Received {url}')
f = PikaFactory(parameters)
f.read_messages(EXCHANGE, ROUTING_KEY, callback)
    reactor.connectTCP(parameters.host, parameters.port, f)
reactor.run()
if __name__ == "__main__":
main()
Here is how you would feed URLs to it (and see them printed on stdout):
#!/usr/bin/env python
import pika
connection = pika.BlockingConnection(
pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.basic_publish(exchange='quotes', routing_key='urls', body='http://quotes.toscrape.com/page/1/')
print(" [x] Sent url to quotes exchange")
connection.close()
How do I run the quotes spider downloader (instead of simply printing the URLs to stdout)?
Thanks!
This might help you: I struggled with the same problem and finally managed to let my spider consume RabbitMQ through the Twisted adapter.
import pika
import scrapy
from pika.adapters import twisted_connection
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.utils.project import get_project_settings
from twisted.internet import defer, protocol, reactor, task


class TwistedConsumer(scrapy.Spider):
    """Twisted-compatible RabbitMQ consumer spider."""
start_urls = []
def start_consuming(self, name):
self.channel = None
self.exchange = None
self.queue = None
self.queue_object = None
self.consumer_tag = None
        settings = get_project_settings()
        self.user = settings.get('RABBITMQ_USER')
        self.password = settings.get('RABBITMQ_PASSWORD')
        self.host = settings.get('RABBITMQ_HOST')
        self.port = settings.get('RABBITMQ_PORT')
        self.virtual_host = settings.get('RABBITMQ_VIRTUAL_HOST')
        self.queue = settings.get('RABBITMQ_QUEUE_NAME')
        self.exchange = settings.get('RABBITMQ_EXCHANGE')
        self.routing_key = settings.get('RABBITMQ_ROUTING_KEY')
credentials = pika.PlainCredentials(self.user, self.password)
parameters = pika.ConnectionParameters(self.host,
self.port,
self.virtual_host,
credentials,
heartbeat=10)
        # a Deferred that fires with the connection once it is ready
        cc = protocol.ClientCreator(
            reactor,
            twisted_connection.TwistedProtocolConnection,
            parameters
        )
        self.deferredConnection = cc.connectTCP(self.host, self.port)
        self.deferredConnection.addCallback(lambda proto: proto.ready)
        self.deferredConnection.addCallback(self.run)
        self.deferredConnection.addCallback(self.populate_start_urls)
self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    def populate_start_urls(self, *args):
        # Requests are injected from the queue via next_request(), so
        # start_urls can stay empty; queue_object.get() returns a Deferred,
        # not a URL, so it cannot be used here directly.
        self.start_urls = []
def spider_idle(self):
""" Waits for request to be scheduled.
:return: None
"""
raise DontCloseSpider
def item_scraped(self, *args, **kwargs):
        self.logger.info("Item scraped")
@defer.inlineCallbacks
def run(self, connection):
"""Receives the connection as parameter and then consumes from the queue
periodically.
"""
self.channel = yield connection.channel()
yield self.channel.exchange_declare(exchange=self.exchange,
auto_delete=True,
exchange_type="direct",
durable=True)
        yield self.channel.queue_declare(
            queue=self.queue,
            durable=True,
            arguments={},  # put whatever optional queue arguments you may need
        )
        yield self.channel.queue_bind(exchange=self.exchange,
                                      routing_key=self.routing_key,
                                      queue=self.queue)
yield self.channel.basic_qos(prefetch_count=10)
self.queue_object, self.consumer_tag = yield self.channel.basic_consume(
queue=self.queue,
auto_ack=True)
        lc = task.LoopingCall(self.read, self.queue_object)
        lc.start(interval=0.01)
    def next_request(self, body):
        raise NotImplementedError
@defer.inlineCallbacks
def read(self, queue_object):
"""Responsible for consuming the queue. See that it ACKs at the end.
"""
# queue_object.get() is a ClosableDeferredQueue, hence, a Deferred
ch, method, properties, body = yield queue_object.get()
if body:
self.next_request(body)
And then implement a spider inheriting from it:
class MySpider(TwistedConsumer):
name = "my_spider"
start_urls = []
@classmethod
def from_crawler(cls, crawler):
        spider = super().from_crawler(crawler)
spider.start_consuming(spider.name)
return spider
def next_request(self, body):
# parse your body here
# ....
        # let's assume your url (and cookies, if any) are coming from the body
        request = scrapy.Request(url, cookies=cookies, callback=self.parse)
self.crawler.engine.crawl(request, self)
def parse(self, response):
        # do your parsing stuff here
        pass
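For completeness, here is a minimal sketch (untested) of how such a spider could be launched programmatically so that Scrapy and pika share the same Twisted reactor; it assumes MySpider is the class above and that your project settings contain the RABBITMQ_* keys read in start_consuming():
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# CrawlerProcess starts the Twisted reactor itself, so the pika connection
# created in start_consuming() ends up running on the same reactor.
process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)
process.start()  # blocks; spider_idle raises DontCloseSpider, so the spider keeps waiting for messages
Also note that self.crawler.engine.crawl(request, self) matches older Scrapy versions; depending on your Scrapy version, engine.crawl() may accept only the request, so check the API of the release you are running.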