
gevent pool getting stuck

I'm new to this, but I thought I had it working to some extent. Basically, with a pool of 1 the code proceeds, while with larger pools it gets stuck, usually within the first pool (e.g., with a pool of 5 I see 3 greenlets finishing, but no more than 3). What is going wrong? spawn? join?

I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a quick succession of serial requests, so probably not...

(Since I'm not sure where the bug is, I'm sharing all of the code. Thanks for bearing with me.)

from urllib2 import urlopen
from lxml.etree import parse
import os, sys, csv, cStringIO, codecs, pickle  # sys is used by sys.exc_info() in the except blocks below
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')

HOSTNAME = 'http://kozbeszerzes.ceu.hu'

driver = webdriver.Chrome()
results = set()

for y in xrange(1998,2015):
    for p in xrange(0,9999):
        driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y,p))
        sleep(1)
        if len(driver.find_elements_by_class_name('result'))==0:
            break
        for e in driver.find_elements_by_class_name('result'):
            link = e.find_element_by_tag_name('a')
            r = link.get_attribute('href').encode('ascii', 'ignore')
            if r[:34]== 'http://kozbeszerzes.ceu.hu/tender/':
                results.add(r)
driver.quit()

with open('list_of_urls', 'wb') as f:
    pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
#     results = pickle.load(f)

entities = set()

header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')

# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)

f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workres(res):
    try:
        tender = parse(urlopen(res)).getroot()
        print ('%s succeeded' % res)
        for requestor in tender.findall('requestor'):
            entities.add(HOSTNAME + requestor.get('url'))
        id = tender.get('id')
        reqname = tender.get('requestor')
        url = tender.get('url')
        year =  tender.get('year')
        reqid = tender.get('requestor_id')
        subject = tender.get('subject')
        source = tender.get('source_url')
        estval = tender.get('estimated_value')
        for part in tender.findall('./parts/part'):
            winner = part.find('winner')
            entities.add(HOSTNAME + winner.get('url'))
            curr = part.find('currency').text
            date = part.find('decisionDate').text
            value = part.find('value').text
            vat = part.find('vat').text
            row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (res, ex))

jobs = [p.spawn(workres, res) for res in results]
p.join()

f.close()

with open('entities', 'wb') as f:
     pickle.dump(entities, f)

header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']

f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workent(ent):
    try:
        ent = parse(urlopen(ent)).getroot()
        print ('%s succeeded' % ent)
        id = ent.get('id')
        url = ent.get('url')
        name = ent.get('name')
        nominalcity = ent.get('city')
        cities = ent.findall('./resolved_addresses/whitelistAddress/city')
        zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
        streets = ent.findall('./resolved_addresses/whitelistAddress/street')
        for a in xrange(0,len(cities)):
            city = cities[a].text
            zip = zips[a].text
            street = streets[a].text
            row = id, url, name, nominalcity, city, zip, street
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (ent, ex))

jobs = [p.spawn(workent, ent) for ent in entities]
p.join()

f.close()

I see a lot of mistakes here.

  • Also, you are not using gevent.sleep(), and time.sleep blocks (a tiny illustration follows at the end of this answer).
  • Your variable names are too short. You could add a description of what each part of the code is supposed to do. For example, the variable 'p' is used twice.
  • Using both urlopen and the driver module to fetch URLs? Confusing..
  • I would use a queue between the different workers and have only one worker do the write_row calls and handle the file access; right now you access the same file from multiple greenlets (see the sketch right after this list).
  • Use fewer list comprehensions; just write out the loops.
  • I suggest having the try in 'workres' only around the 'parse(urlopen())' code; otherwise more exceptions can occur that you don't currently see (also shown in the sketch below).
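For the queue idea and the narrower try block, here is a minimal sketch, assuming a gevent.queue.Queue between the fetching greenlets and a single writer greenlet. The names rows_q, fetch_one, write_rows and fetch_pool are made up for illustration; 'writer' stands for the UnicodeWriter instance and 'results' for the URL set from the question.

from gevent import monkey
monkey.patch_all()              # patch the stdlib before anything else touches sockets

import gevent
from gevent import pool
from gevent.queue import Queue
from urllib2 import urlopen
from lxml.etree import parse

rows_q = Queue()                # fetched rows go here; only the writer greenlet touches the file

def fetch_one(res):
    # keep the try narrow, as suggested in the last bullet
    try:
        tender = parse(urlopen(res)).getroot()
    except Exception as ex:
        print('%s failed with %s' % (res, ex))
        return
    for part in tender.findall('./parts/part'):
        rows_q.put((tender.get('id'), part.find('value').text))

def write_rows(writer):
    # the only greenlet that writes to the CSV file
    for row in rows_q:          # iterating a gevent Queue blocks until StopIteration is put
        writer.writerow(row)

fetch_pool = pool.Pool(10)      # a distinct name instead of reusing 'p'
writer_job = gevent.spawn(write_rows, writer)
fetch_pool.map(fetch_one, results)   # blocks until every URL in 'results' has been handled
rows_q.put(StopIteration)            # tells the writer greenlet it can stop
writer_job.join()

The same pattern works for the entities pass: only the fetch function and the row layout change, while the single writer keeps all file access in one greenlet.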

More tips for gevent
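Regarding the gevent.sleep point in the first bullet: an explicit gevent.sleep yields to the hub, so other greenlets keep running during the pause. A tiny standalone illustration (a toy example, not the question's code):

import gevent

def poll(name):
    for i in range(3):
        print('%s tick %d' % (name, i))
        gevent.sleep(1)     # yields to the hub so the other greenlet can run

# both greenlets make progress concurrently instead of one blocking the other
gevent.joinall([gevent.spawn(poll, 'a'), gevent.spawn(poll, 'b')])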

