python - Scrapy not parsing response in make_requests_from_url loop
I'm trying to have Scrapy grab a URL from a message queue and then scrape that URL. I have the loop going fine and it grabs the URL from the queue, but it never enters the parse() method once it has a URL; it just continues to loop (and the same URL comes around again even though I've deleted it from the queue...).

While it's running in the terminal, if I Ctrl+C and force it to end, it enters the parse() method, crawls the page, and then ends. I'm not sure what's wrong here.
import time
import boto.sqs
from scrapy import Spider


class MySpider(Spider):
    name = "my_spider"
    allowed_domains = ['domain.com']

    def __init__(self):
        super(MySpider, self).__init__()
        self.url = None

    def start_requests(self):
        while True:
            # crawl the URL from the queue
            yield self.make_requests_from_url(self._pop_queue())

    def _pop_queue(self):
        # grab a URL from the queue
        return self.queue()

    def queue(self):
        url = None
        while url is None:
            conf = {
                "sqs-access-key": "",
                "sqs-secret-key": "",
                "sqs-queue-name": "crawler",
                "sqs-region": "us-east-1",
                "sqs-path": "sqssend"
            }
            # connect to AWS
            conn = boto.sqs.connect_to_region(
                conf.get('sqs-region'),
                aws_access_key_id=conf.get('sqs-access-key'),
                aws_secret_access_key=conf.get('sqs-secret-key')
            )
            q = conn.get_queue(conf.get('sqs-queue-name'))
            message = conn.receive_message(q)
            # didn't get a message back, wait.
            if not message:
                time.sleep(10)
                url = None
            else:
                url = message
        if url is not None:
            message = url[0]
            message_body = str(message.get_body())
            message.delete()
            self.url = message_body
            return self.url

    def parse(self, response):
        ...
        yield item
Update from the comments:
def start_requests(self):
    while True:
        # crawl the URL from the queue
        queue = self._pop_queue()
        self.logger.error(queue)
        if queue is None:
            time.sleep(10)
            continue
        url = queue
        if url:
            yield self.make_requests_from_url(url)
I removed the "while url is None:" loop, but I still have the same problem.
Would it be right to assume that if this works:
import scrapy
import random


class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]

    def __init__(self):
        super(ExampleSpider, self).__init__()
        self.url = None

    def start_requests(self):
        while True:
            # crawl the URL from the queue
            yield self.make_requests_from_url(self._pop_queue())

    def _pop_queue(self):
        # grab a URL from the queue
        return self.queue()

    def queue(self):
        return 'http://www.example.com/?{}'.format(random.randint(0, 100000))

    def parse(self, response):
        print "successfully parsed!"
then your code should work just as well, unless:

- there's a problem with allowed_domains and your queue returns URLs outside it, or
- there's a problem with your queue() function and/or the data it produces, e.g. it returns arrays, or it blocks indefinitely, or something like that (a quick check for both is sketched just below).
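As a quick sanity check for both of those, you could log and type-check whatever queue() hands back. This is a hypothetical debugging version of your own _pop_queue() method (reusing the _pop_queue()/queue() names and self.logger from your spider), nothing more:

import time

def _pop_queue(self):
    # hypothetical debugging replacement for _pop_queue(): log how long
    # the blocking queue() call took and exactly what it returned, so
    # lists, None values or indefinite blocking show up in the spider log
    start = time.time()
    url = self.queue()
    self.logger.debug("queue() took %.1fs and returned %r (%s)",
                      time.time() - start, url, type(url).__name__)
    assert isinstance(url, str), "queue() must return a single URL string"
    return url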
Note also that the boto library is blocking and not Twisted/asynchronous. In order not to block Scrapy while using it, you would have to use a Twisted-compatible SQS library like txsqs. Alternatively you might want to run the boto calls in a separate thread with deferToThread.
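A minimal sketch of the deferToThread route (my assumption of how it could be wired, with hypothetical _receive_blocking/_fetch_messages_async/_handle_messages names; it only moves the existing blocking boto call onto Twisted's thread pool, assuming the connection and queue are stored on the spider as self.conn and self.queue):

from twisted.internet import threads

def _receive_blocking(self):
    # the plain, blocking boto call, exactly as before
    return self.conn.receive_message(self.queue, number_messages=10)

def _fetch_messages_async(self):
    # run the blocking call in a worker thread so the Twisted reactor
    # (and therefore Scrapy's downloading) keeps running in the meantime
    d = threads.deferToThread(self._receive_blocking)
    d.addCallback(self._handle_messages)  # hypothetical callback that schedules the requests
    return d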
After following the question on the scrapy-users list, I believe I understand your code well enough to say that it's quite far from functional, and that this is as much a generic boto/SQS question as a Scrapy question. Anyway - here's an averagely functional solution.
I've created an AWS SQS queue, and then gave it overly broad permissions.
Now I'm able to submit messages to the queue with the AWS CLI like this:
$ aws --region eu-west-1 sqs send-message --queue-url "https://sqs.eu-west-1.amazonaws.com/123412341234/my_queue" --message-body 'url:https://stackoverflow.com'
For some weird reason, I think that when I was setting --message-body to just a URL it was actually downloading the page and sending the result as the message body(!). Not sure - I don't have time to confirm this, but it's interesting. Anyway.
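If you want to rule out any CLI quirks, here's a hedged equivalent of that send-message call from plain Python with boto (same region and queue name as above; credentials assumed to be configured elsewhere):

import boto.sqs
from boto.sqs.message import RawMessage

# connect to the same region and queue used in the CLI example
conn = boto.sqs.connect_to_region('eu-west-1')
queue = conn.get_queue('my_queue')

# RawMessage sends the body exactly as given, with no base64 wrapping
msg = RawMessage()
msg.set_body('url:https://stackoverflow.com')
queue.write(msg)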
Here's the proper'ish spider code. As said before, boto's blocking API is bad. In this implementation I call its API once from start_requests(), and then again only whenever the spider goes idle, in the spider_idle() callback. At that point, because the spider is idle, the fact that boto is blocking doesn't pose much of a problem. While I pull URLs from SQS, I pull as many as possible in the while loop (you could put a limit there if you don't want to consume e.g. more than 500 at a time) in order to have to call the blocking API as rarely as possible. Notice also the call to conn.delete_message_batch(), which actually removes the messages from the queue (otherwise they would just stay there forever), and queue.set_message_class(boto.sqs.message.RawMessage), which avoids problems decoding the message body (boto's default Message class expects base64-encoded bodies).
Overall this might be an OK solution for your level of requirements.
from scrapy import Spider, Request
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
import boto.sqs
import boto.sqs.message


class CpuZ(Spider):
    name = "cpuz"
    allowed_domains = ['valid.x86.fr']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CpuZ, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def __init__(self, *args, **kwargs):
        super(CpuZ, self).__init__(*args, **kwargs)
        conf = {
            "sqs-access-key": "AK????????????????",
            "sqs-secret-key": "ab????????????????????????????????",
            "sqs-queue-name": "my_queue",
            "sqs-region": "eu-west-1",
        }
        self.conn = boto.sqs.connect_to_region(
            conf.get('sqs-region'),
            aws_access_key_id=conf.get('sqs-access-key'),
            aws_secret_access_key=conf.get('sqs-secret-key')
        )
        self.queue = self.conn.get_queue(conf.get('sqs-queue-name'))
        assert self.queue
        self.queue.set_message_class(boto.sqs.message.RawMessage)

    def _get_some_urls_from_sqs(self):
        while True:
            messages = self.conn.receive_message(self.queue, number_messages=10)
            if not messages:
                break
            for message in messages:
                body = message.get_body()
                if body[:4] == 'url:':
                    url = body[4:]
                    yield self.make_requests_from_url(url)
            self.conn.delete_message_batch(self.queue, messages)

    def spider_idle(self, spider):
        for request in self._get_some_urls_from_sqs():
            self.crawler.engine.crawl(request, self)
        raise DontCloseSpider()

    def start_requests(self):
        for request in self._get_some_urls_from_sqs():
            yield request

    def parse(self, response):
        yield {
            "freq_clock": response.url
        }
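To try it out quickly, this is one way to run the spider from a plain script - my sketch, not part of the original setup, assuming the class above is importable as CpuZ and the AWS keys are filled in:

from scrapy.crawler import CrawlerProcess

# hypothetical runner script for the spider defined above
process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(CpuZ)
process.start()  # blocks until the spider is closed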