简体   繁体   English

Python multiprocessing.Process cannot stop when after connecting the.network

[英]Python multiprocessing.Process can not stop when after connecting the network

When I try to crawl thesis information in multiple threads, I cannot close the process after getting the information:多线程爬取论文信息时,获取信息后无法关闭进程:

error And when I comment the code which function is get the information from.network, these processes can end normally. error当我注释掉function是从.network获取信息的代码时,这些进程可以正常结束。 normal This error is trouble me and I don't have any idea, my.network connect is by requests and set the response.close() so can any handsome brother or beautiful lady help this confused person? Thanks normal这个错误很困扰我,我不知道,我的.network connect 是通过requests和set response.close()有没有帅哥或者美女帮帮这个迷茫的人? 谢谢

This is whole code: my python is python 3.7这是完整的代码:我的 python 是 python 3.7


from multiprocessing import Process, Queue, Pool,Manager,Value
import time, random
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    'Connection': 'close'
}

## Just get the html text
def GetUrlInfo(url):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    response.close()
    SoupData = BeautifulSoup(response.text, 'lxml')
    return SoupData

def GetVolumeUrlfromUrl(url:str)->str:
    """input is Journal's url and output is a link and a text description to each issue of the journal"""
    url = re.sub('http:', 'https:', url)
    SoupDataTemp = GetUrlInfo(url+'index.html')
    SoupData = SoupDataTemp.find_all('li')
    UrlALL = []
    for i in SoupData:
        if i.find('a') != None:
            volumeUrlRule = '<a href=\"(.*?)\">(.*?)</a>'
            volumeUrlTemp = re.findall(volumeUrlRule,str(i),re.I)
            # u = i.find('a')['href']
            # # print(u)
            for u in volumeUrlTemp:
                if re.findall(url, u[0]):
                    # print(u)
                    UrlALL.append((u[0], u[1]), )
    # print(UrlALL)
    return UrlALL

def GetPaperBaseInfoFromUrlAll(url:str)->str:
    """The input is the url and the output is all the paper information obtained from the web page,
    including, doi, title, author, and the date about this volume """
    soup = GetUrlInfo(url)
    temp1 = soup.find_all('li',class_='entry article')
    temp2= soup.find_all('h2')
    temp2=re.sub('\\n',' ',temp2[1].text)
    # print(temp2)
    volumeYear = re.split(' ',temp2)[-1]
    paper = []
    for i in temp1:
        if i.find('div',class_='head').find('a')== None:
            paperDoi = ''
        else:
            paperDoi = i.find('div',class_='head').find('a')['href']
        title = i.find('cite').find('span',class_='title').text[:-2]
        paper.append([paperDoi,title])
    return paper,volumeYear


# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)

# put the url into the query
def Write(query,value,num):
    for count in range(num):
        query.put(value[count][0],True)
        # time.sleep(random.random())
    print('write end')

# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
    while True:
        count = COUNT.get(True)
        # print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
        COUNT.put(count, True)
        if not query.empty():
            value = query.get(True)
            count = COUNT.get(True)
            count = count + 1
            COUNT.put(count,True)
            paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
            print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
            numb = paperNumber.get(True)
            numb = numb + len(paper)
            paperNumber.put(numb) # just commented
            # print(paper,thisYear)
            PaperInfo1.put((paper,thisYear),) # just commented
            print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
        if not COUNT.empty():
            count = COUNT.get(True)
            # print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
            COUNT.put(count,True)
            if int(count) == int(num):
                print("the process "+str(i)+" end ")
                break
    print('read end')

# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
    for i in range(paperNumber.get(True)):
            value = PaperInfo1.get(True)
            print(value)

if __name__=='__main__':

    r_num = 10 # th read process number
    w_num = 1 # th write process number
    w_cnt = UrlLen # the write counter
    q = Queue(UrlLen) # the volune url queue
    paperNumber = Queue(1) # the all paper number
    COUNT = Queue(1) # the end tag
    COUNT.put(int(0)) # first is zero
    paperNumber.put(int(0)) # first is zero
    PaperInfo1 = Queue()
    r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
    w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]

    time_start = time.time()
    [task.start() for task in w_list]
    [task.start() for task in r_list]

    [task.join() for task in w_list]
    [task.join() for task in r_list]

    time_used = time.time() - time_start
    GetPaperInfo(PaperInfo1, paperNumber)
    print('time_used:{}s'.format(time_used))


I have no idea, with debug the process finally enter the process.py -> row:297: try: self.run() and then enter the row:300: util._exit_function() and just a connected the debug but I dont know why the.network can cause this error and how to solve this that's all Thank you!我不知道,调试过程最终进入process.py -> row:297: try: self.run()然后进入 row:300: util._exit_function()连接调试但我不知道为什么.network 会导致这个错误以及如何解决这个问题就这些了谢谢!

Hi,this is me again,I tried a concurrent implementation of threads ,and global variables for threads are much more comfortable than process queue data sharing.嗨,又是我,我尝试了线程的并发实现,线程global变量比进程队列数据共享舒服多了。 By thread it does implement but my main function can't be stopped , previously with processes it was not possible to proceed to the next step when fetching concurrently, the fetching of data was implemented through threads and continued in the main function but the main function can't be stopped anymore .通过线程它确实实现了但我的主 function 无法停止,以前使用进程在并发获取时无法继续下一步fetching of data was implemented through threads and continued in the main functionthe main function can't be stopped anymore How interesting!多么有趣!

I have designed three functions similar to the previous ones.我设计了三个与前面类似的功能。

GetUrlintoQueue is to write the fetched url UrlALL to the queue UrlQueue , UrlLen is the number of the url. GetUrlintoQueue是将抓取到的url 的UrlALL到队列UrlQueue中, UrlLen是url 的编号。

import threading
import queue

count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers

def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
    for index in range(UrlLen):
        UrlQueue.put(UrlALL[index][0], True)
    print('Write End')
    UrlQueue.task_done()

The other is GetPaperInfofromUrl .另一个是GetPaperInfofromUrl Get the url from the UrlQueue and write the information of the corresponding page to PaperInfo , index is the thread number.从 UrlQueue 中获取UrlQueue并将对应页面的信息写入PaperInfoindex为线程号。

def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
    global count,paperNumber
    while True:
        if not UrlQueue.empty():
            url = UrlQueue.get(True)
            count = count + 1
            paper, thisYear = GetPaperBaseInfoFromUrlAll(url)  # just commented
            print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
            print(paper,thisYear)
            paperNumber = paperNumber + len(paper)
            PaperInfo.put((paper, thisYear), True)
        if count == UrlLen:
            print("the process " + str(index) + " end ")
            break
    UrlQueue.task_done()
    PaperInfo.task_done()
    print('the process ' + str(index) +' get paper info end')

GetPaperInfo is to show the results about PaperInfo , and it don't change. GetPaperInfo是显示PaperInfo的结果,它不会改变。

def GetPaperInfo(PaperInfo,paperNumber):
    for i in range(paperNumber):
            value = PaperInfo.get(True)
            print(value)

The main function first sets the corresponding variables, then writes directly first, then 10 threads crawl paper information, and finally shows the results, but after displaying the results still can not exit , I can not understand why. main function先设置好相应的变量,然后直接先写入,然后10个线程爬取论文信息,最后显示结果,但是显示结果后还是can not exit ,我也想不明白为什么。

if __name__ == '__main__':
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    UrlALL = GetVolumeUrlfromUrl(url)
    UrlLen = len(UrlALL)
    UrlQueue = queue.Queue(UrlLen)
    PaperInfo = queue.Queue(1000)
    WriteThread = 1
    ReadThread = 10

    # url write
    GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
    time_start = time.time()
    [geturl.start() for geturl in GetUrlThread]
    [geturl.join() for geturl in GetUrlThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # url write end

    # paperinfo get
    PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
    time_start = time.time()
    [getpaper.start() for getpaper in PaperinfoGetThread]
    [getpaper.join() for getpaper in PaperinfoGetThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # paperinfo get end
    
    GetPaperInfo(PaperInfo,paperNumber) # show the results
    import sys # it does not work 
    sys.exit()

The debug shows: debug.gif (I dont have 10 reputation so the picture is the type of link. )调试显示: debug.gif (我没有 10 个信誉所以图片是链接类型。)

Here is how your process might look using concurrent.futures to manage all the threads and data transport.以下是您的进程使用 concurrent.futures 来管理所有线程和数据传输的样子。 (not tested) Adapting an example in the documentation . (未测试)改编文档中的示例

from concurrent.futures import ThreadPoolExecutor 

def GetPaperInfofromUrl(index,url):
    paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
    return (index,url,paper,thisYear)

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls,descr = zip(*GetVolumeUrlfromUrl(url))
    results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futs = [executor.submit(GetPaperInfofromUrl, index,url) for index,url in enumerate(urls)]
        for future in concurrent.futures.as_completed(futs):
            results.append(future.result())

GetPaperInfofromUrl seems superfluous, you could probably refactor GetPaperBaseInfoFromUrlAll and avoid a function call. GetPaperInfofromUrl似乎是多余的,您可以重构GetPaperBaseInfoFromUrlAll并避免调用 function。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM