
Python multiprocessing.Process cannot stop after connecting to the network


When crawling paper information with multiple processes, the processes cannot be closed after the information has been fetched:

Error: when I comment out the code in the function that fetches information from the network, the processes end normally. This error really puzzles me and I don't know why; my network connection goes through requests and I set response.close(). Could anyone help this confused person? Thanks.

Here is the complete code; my Python version is Python 3.7.


from multiprocessing import Process, Queue, Pool, Manager, Value
import time, random
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    'Connection': 'close'
}

## Just get the html text
def GetUrlInfo(url):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    response.close()
    SoupData = BeautifulSoup(response.text, 'lxml')
    return SoupData

def GetVolumeUrlfromUrl(url: str) -> list:
    """input is Journal's url and output is a link and a text description to each issue of the journal"""
    url = re.sub('http:', 'https:', url)
    SoupDataTemp = GetUrlInfo(url+'index.html')
    SoupData = SoupDataTemp.find_all('li')
    UrlALL = []
    for i in SoupData:
        if i.find('a') != None:
            volumeUrlRule = '<a href=\"(.*?)\">(.*?)</a>'
            volumeUrlTemp = re.findall(volumeUrlRule,str(i),re.I)
            # u = i.find('a')['href']
            # # print(u)
            for u in volumeUrlTemp:
                if re.findall(url, u[0]):
                    # print(u)
                    UrlALL.append((u[0], u[1]), )
    # print(UrlALL)
    return UrlALL

def GetPaperBaseInfoFromUrlAll(url: str) -> tuple:
    """The input is the url and the output is all the paper information obtained from the web page,
    including, doi, title, author, and the date about this volume """
    soup = GetUrlInfo(url)
    temp1 = soup.find_all('li',class_='entry article')
    temp2= soup.find_all('h2')
    temp2=re.sub('\\n',' ',temp2[1].text)
    # print(temp2)
    volumeYear = re.split(' ',temp2)[-1]
    paper = []
    for i in temp1:
        if i.find('div',class_='head').find('a')== None:
            paperDoi = ''
        else:
            paperDoi = i.find('div',class_='head').find('a')['href']
        title = i.find('cite').find('span',class_='title').text[:-2]
        paper.append([paperDoi,title])
    return paper,volumeYear


# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)

# put the url into the query
def Write(query,value,num):
    for count in range(num):
        query.put(value[count][0],True)
        # time.sleep(random.random())
    print('write end')

# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
    while True:
        count = COUNT.get(True)
        # print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
        COUNT.put(count, True)
        if not query.empty():
            value = query.get(True)
            count = COUNT.get(True)
            count = count + 1
            COUNT.put(count,True)
            paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
            print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
            numb = paperNumber.get(True)
            numb = numb + len(paper)
            paperNumber.put(numb) # just commented
            # print(paper,thisYear)
            PaperInfo1.put((paper,thisYear),) # just commented
            print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
        if not COUNT.empty():
            count = COUNT.get(True)
            # print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
            COUNT.put(count,True)
            if int(count) == int(num):
                print("the process "+str(i)+" end ")
                break
    print('read end')

# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
    for i in range(paperNumber.get(True)):
            value = PaperInfo1.get(True)
            print(value)

if __name__=='__main__':

    r_num = 10 # the number of read processes
    w_num = 1 # the number of write processes
    w_cnt = UrlLen # the write counter
    q = Queue(UrlLen) # the volume url queue
    paperNumber = Queue(1) # the total paper number
    COUNT = Queue(1) # the end tag
    COUNT.put(int(0)) # first is zero
    paperNumber.put(int(0)) # first is zero
    PaperInfo1 = Queue()
    r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
    w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]

    time_start = time.time()
    [task.start() for task in w_list]
    [task.start() for task in r_list]

    [task.join() for task in w_list]
    [task.join() for task in r_list]

    time_used = time.time() - time_start
    GetPaperInfo(PaperInfo1, paperNumber)
    print('time_used:{}s'.format(time_used))


I don't know why: during debugging it eventually enters process.py -> line 297: try: self.run() and then line 300: util._exit_function(), and the debugger just stays attached there. I don't know why the network causes this error or how to solve it. That's all, thanks!
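The multiprocessing programming guidelines also mention that a process which has put items on a Queue will wait before terminating until all the buffered items have been flushed by the feeder thread, so queued items should be removed before the producer process is joined. I am not sure whether this is what happens to me, but below is a minimal standalone sketch of that pitfall (the worker and the payload size are only for illustration, not my real crawler):

from multiprocessing import Process, Queue

def worker(q):
    # Puts a large payload on the queue; the child cannot exit until the
    # feeder thread has flushed the buffered data to the underlying pipe.
    q.put('x' * 10_000_000)

if __name__ == '__main__':
    q = Queue()
    p = Process(target=worker, args=(q,))
    p.start()
    # Draining the queue before join() lets the child terminate;
    # calling p.join() before q.get() can hang, much like the crawler above.
    data = q.get()
    p.join()
    print('child exited, got', len(data), 'characters')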

Hi, it's me again. I tried a concurrent implementation with threads; sharing data through thread global variables is much more comfortable than through process queues. With threads the crawling does work, but my main function cannot stop. Before, with processes, the program could not continue to the next step while fetching concurrently; now the fetching of data is implemented through threads and the main function does continue, but the main function can't be stopped anymore. How interesting!
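Roughly, by sharing through globals I mean the pattern below (just a minimal sketch; the Lock here is an extra precaution for the shared counter, and my real code below does not use one):

import threading

count = 0                       # shared by all worker threads
count_lock = threading.Lock()   # guards the increments (not used in my real code below)

def worker(n):
    global count
    for _ in range(n):
        with count_lock:        # without the lock, concurrent += can lose updates
            count += 1

threads = [threading.Thread(target=worker, args=(1000,)) for _ in range(10)]
[t.start() for t in threads]
[t.join() for t in threads]
print(count)                    # 10000 with the lock; possibly less without it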

I designed three functions similar to the previous ones.

GetUrlintoQueue puts the crawled urls in UrlALL into the queue UrlQueue; UrlLen is the number of urls.

import threading
import queue

count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers

def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
    for index in range(UrlLen):
        UrlQueue.put(UrlALL[index][0], True)
    print('Write End')
    UrlQueue.task_done()

The other is GetPaperInfofromUrl, which gets a url from UrlQueue and writes the information from the corresponding page into PaperInfo; index is the thread number.

def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
    global count,paperNumber
    while True:
        if not UrlQueue.empty():
            url = UrlQueue.get(True)
            count = count + 1
            paper, thisYear = GetPaperBaseInfoFromUrlAll(url)  # just commented
            print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
            print(paper,thisYear)
            paperNumber = paperNumber + len(paper)
            PaperInfo.put((paper, thisYear), True)
        if count == UrlLen:
            print("the process " + str(index) + " end ")
            break
    UrlQueue.task_done()
    PaperInfo.task_done()
    print('the process ' + str(index) +' get paper info end')

GetPaperInfo displays the results in PaperInfo; it is unchanged from before.

def GetPaperInfo(PaperInfo,paperNumber):
    for i in range(paperNumber):
            value = PaperInfo.get(True)
            print(value)

The main function first sets up the corresponding variables, then writes the urls first, then 10 threads crawl the paper information, and finally displays the results; but after the results are displayed it still cannot exit, and I cannot figure out why either.

if __name__ == '__main__':
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    UrlALL = GetVolumeUrlfromUrl(url)
    UrlLen = len(UrlALL)
    UrlQueue = queue.Queue(UrlLen)
    PaperInfo = queue.Queue(1000)
    WriteThread = 1
    ReadThread = 10

    # url write
    GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
    time_start = time.time()
    [geturl.start() for geturl in GetUrlThread]
    [geturl.join() for geturl in GetUrlThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # url write end

    # paperinfo get
    PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
    time_start = time.time()
    [getpaper.start() for getpaper in PaperinfoGetThread]
    [getpaper.join() for getpaper in PaperinfoGetThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # paperinfo get end
    
    GetPaperInfo(PaperInfo,paperNumber) # show the results
    import sys # it does not work 
    sys.exit()

The debugger shows: debug.gif (I don't have 10 reputation yet, so the image is just a link.)

Here is what your process would look like using concurrent.futures to manage all the threads and data transfer. (Untested.) Adapted from an example in the docs:

from concurrent.futures import ThreadPoolExecutor, as_completed

def GetPaperInfofromUrl(index,url):
    paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
    return (index,url,paper,thisYear)

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls,descr = zip(*GetVolumeUrlfromUrl(url))
    results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futs = [executor.submit(GetPaperInfofromUrl, index,url) for index,url in enumerate(urls)]
        for future in as_completed(futs):
            results.append(future.result())

GetPaperInfofromUrl seems redundant; you could refactor GetPaperBaseInfoFromUrlAll and avoid the extra function call.
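For example, a minimal sketch of that refactor (untested, reusing the functions defined above) submits GetPaperBaseInfoFromUrlAll directly and maps each future back to its url:

from concurrent.futures import ThreadPoolExecutor, as_completed

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls, descr = zip(*GetVolumeUrlfromUrl(url))
    results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its url instead of wrapping the call
        futs = {executor.submit(GetPaperBaseInfoFromUrlAll, u): u for u in urls}
        for future in as_completed(futs):
            paper, thisYear = future.result()
            results.append((futs[future], paper, thisYear))

    for volume_url, paper, thisYear in results:
        print(volume_url, thisYear, len(paper), 'papers')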

