I am a gevent newbie, but I think I got it working — in a limited sense. Basically, for pools of 1, the code proceeds, while for larger pools the code gets stuck, usually within the first pool (eg with a pool of 5, I see 3 greenlet finishing, but not more). What is going wrong? Spawn? Join?
I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so probably not…
(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)
from urllib2 import urlopen
from lxml.etree import parse
import os, csv, cStringIO, codecs, pickle
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([unicode(s).encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')
HOSTNAME = 'http://kozbeszerzes.ceu.hu'
driver = webdriver.Chrome()
results = set()
for y in xrange(1998,2015):
for p in xrange(0,9999):
driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y,p))
sleep(1)
if len(driver.find_elements_by_class_name('result'))==0:
break
for e in driver.find_elements_by_class_name('result'):
link = e.find_element_by_tag_name('a')
r = link.get_attribute('href').encode('ascii', 'ignore')
if r[:34]== 'http://kozbeszerzes.ceu.hu/tender/':
results.add(r)
driver.quit()
with open('list_of_urls', 'wb') as f:
pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
# results = pickle.load(f)
entities = set()
header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')
# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)
f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workres(res):
try:
tender = parse(urlopen(res)).getroot()
print ('%s succeeded' % res)
for requestor in tender.findall('requestor'):
entities.add(HOSTNAME + requestor.get('url'))
id = tender.get('id')
reqname = tender.get('requestor')
url = tender.get('url')
year = tender.get('year')
reqid = tender.get('requestor_id')
subject = tender.get('subject')
source = tender.get('source_url')
estval = tender.get('estimated_value')
for part in tender.findall('./parts/part'):
winner = part.find('winner')
entities.add(HOSTNAME + winner.get('url'))
curr = part.find('currency').text
date = part.find('decisionDate').text
value = part.find('value').text
vat = part.find('vat').text
row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (res, ex))
jobs = [p.spawn(workres, res) for res in results]
p.join()
f.close()
with open('entities', 'wb') as f:
pickle.dump(entities, f)
header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']
f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workent(ent):
try:
ent = parse(urlopen(ent)).getroot()
print ('%s succeeded' % ent)
id = ent.get('id')
url = ent.get('url')
name = ent.get('name')
nominalcity = ent.get('city')
cities = ent.findall('./resolved_addresses/whitelistAddress/city')
zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
streets = ent.findall('./resolved_addresses/whitelistAddress/street')
for a in xrange(0,len(cities)):
city = cities[a].text
zip = zips[a].text
street = streets[a].text
row = id, url, name, nominalcity, city, zip, street
writer.writerow(row)
except socket.gaierror:
ex = sys.exc_info()[1]
print ('%s failed with %s' % (ent, ex))
jobs = [p.spawn(workent, ent) for ent in entities]
p.join()
f.close()
I see many mistakes here.
gevent.sleep()
and not time.sleep
which is blocking. use less list compehensions just write out the loops.
I would suggest putting the try except in 'workres' only around the 'parse(urlopen())' code maybe there are more exceptions happening, which you now don't see.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.