
gevent pool getting stuck

I am a gevent newbie, but I think I got it working, in a limited sense. Basically, for pools of 1 the code proceeds, while for larger pools it gets stuck, usually within the first pool (e.g. with a pool of 5, I see 3 greenlets finishing, but no more). What is going wrong? Spawn? Join?

I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so probably not…

(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)

from urllib2 import urlopen
from lxml.etree import parse
import os, sys, csv, cStringIO, codecs, pickle  # sys is needed for sys.exc_info() in the workers
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')

HOSTNAME = 'http://kozbeszerzes.ceu.hu'

driver = webdriver.Chrome()
results = set()

for y in xrange(1998,2015):
    for p in xrange(0,9999):
        driver.get('{}/searchresults.xhtml?q={}&page={}'.format(HOSTNAME, y, p))
        sleep(1)
        if len(driver.find_elements_by_class_name('result'))==0:
            break
        for e in driver.find_elements_by_class_name('result'):
            link = e.find_element_by_tag_name('a')
            r = link.get_attribute('href').encode('ascii', 'ignore')
            if r.startswith(HOSTNAME + '/tender/'):
                results.add(r)
driver.quit()

with open('list_of_urls', 'wb') as f:
    pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
#     results = pickle.load(f)

entities = set()

header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')

# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)

f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workres(res):
    try:
        tender = parse(urlopen(res)).getroot()
        print ('%s succeeded' % res)
        for requestor in tender.findall('requestor'):
            entities.add(HOSTNAME + requestor.get('url'))
        id = tender.get('id')
        reqname = tender.get('requestor')
        url = tender.get('url')
        year =  tender.get('year')
        reqid = tender.get('requestor_id')
        subject = tender.get('subject')
        source = tender.get('source_url')
        estval = tender.get('estimated_value')
        for part in tender.findall('./parts/part'):
            winner = part.find('winner')
            entities.add(HOSTNAME + winner.get('url'))
            curr = part.find('currency').text
            date = part.find('decisionDate').text
            value = part.find('value').text
            vat = part.find('vat').text
            row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (res, ex))

jobs = [p.spawn(workres, res) for res in results]
p.join()

f.close()

with open('entities', 'wb') as f:
     pickle.dump(entities, f)

header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']

f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workent(ent):
    try:
        root = parse(urlopen(ent)).getroot()  # keep 'ent' as the URL so the log messages stay readable
        print ('%s succeeded' % ent)
        id = root.get('id')
        url = root.get('url')
        name = root.get('name')
        nominalcity = root.get('city')
        cities = root.findall('./resolved_addresses/whitelistAddress/city')
        zips = root.findall('./resolved_addresses/whitelistAddress/postalCode')
        streets = root.findall('./resolved_addresses/whitelistAddress/street')
        for a in xrange(0,len(cities)):
            city = cities[a].text
            zip = zips[a].text
            street = streets[a].text
            row = id, url, name, nominalcity, city, zip, street
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (ent, ex))

jobs = [p.spawn(workent, ent) for ent in entities]
p.join()

f.close()

I see many mistakes here.

  • You never call gevent.sleep(); you use time.sleep(), which blocks.
  • Your variable names are too short, and you could add comments describing what each part of the code is supposed to do. For example, the name p is used twice.
  • You fetch URLs both with urlopen and with the Selenium driver module, which is confusing.
  • I would use queues between the workers and have just one greenlet do the writerow calls and own the file; right now you have multiple greenlets accessing the same file (see the sketch after this list).
  • Use fewer list comprehensions; just write out the loops.
  • I would suggest putting the try/except in workres only around the parse(urlopen(...)) call; there may be more exceptions happening that you currently don't see (also addressed in the sketch below).
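
A minimal sketch of that queue-plus-single-writer pattern, which also keeps the try/except tight around the network call as the last point suggests. Everything here (row_queue, fetch_one, writer_loop, the DONE sentinel, out.csv, the example URLs) is illustrative, not taken from your code:

from gevent import monkey
monkey.patch_all()  # patch the stdlib before importing anything that uses sockets

import csv
import urllib2

import gevent
from gevent import pool
from gevent.queue import Queue

row_queue = Queue()
DONE = object()  # sentinel telling the writer greenlet to stop

def writer_loop(path):
    # The only greenlet that touches the file, so rows never interleave.
    with open(path, 'wb') as f:
        writer = csv.writer(f)
        while True:
            row = row_queue.get()
            if row is DONE:
                return
            writer.writerow(row)

def fetch_one(url):
    # Keep the try/except tight around the network call only.
    try:
        body = urllib2.urlopen(url, timeout=10).read()
    except Exception as ex:  # narrow this to the exceptions you expect
        print('%s failed with %s' % (url, ex))
        return
    row_queue.put((url, len(body)))  # hand the row off instead of writing here

urls = ['http://example.com/', 'http://example.org/']
writer_greenlet = gevent.spawn(writer_loop, 'out.csv')
worker_pool = pool.Pool(10)
for url in urls:
    worker_pool.spawn(fetch_one, url)
worker_pool.join()   # wait until every worker has queued its rows ...
row_queue.put(DONE)  # ... then tell the writer it can stop
writer_greenlet.join()

Because only writer_loop ever touches the file, the workers cannot corrupt each other's rows, and the DONE sentinel is queued only after worker_pool.join() guarantees every worker has finished.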

More tips for gevent
