简体   繁体   中英

Python multiprocessing.Process can not stop when after connecting the network

When I try to crawl thesis information in multiple threads, I cannot close the process after getting the information:

error And when I comment the code which function is get the information from.network, these processes can end normally. normal This error is trouble me and I don't have any idea, my.network connect is by requests and set the response.close() so can any handsome brother or beautiful lady help this confused person? Thanks

This is whole code: my python is python 3.7


from multiprocessing import Process, Queue, Pool,Manager,Value
import time, random
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    'Connection': 'close'
}

## Just get the html text
def GetUrlInfo(url):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    response.close()
    SoupData = BeautifulSoup(response.text, 'lxml')
    return SoupData

def GetVolumeUrlfromUrl(url:str)->str:
    """input is Journal's url and output is a link and a text description to each issue of the journal"""
    url = re.sub('http:', 'https:', url)
    SoupDataTemp = GetUrlInfo(url+'index.html')
    SoupData = SoupDataTemp.find_all('li')
    UrlALL = []
    for i in SoupData:
        if i.find('a') != None:
            volumeUrlRule = '<a href=\"(.*?)\">(.*?)</a>'
            volumeUrlTemp = re.findall(volumeUrlRule,str(i),re.I)
            # u = i.find('a')['href']
            # # print(u)
            for u in volumeUrlTemp:
                if re.findall(url, u[0]):
                    # print(u)
                    UrlALL.append((u[0], u[1]), )
    # print(UrlALL)
    return UrlALL

def GetPaperBaseInfoFromUrlAll(url:str)->str:
    """The input is the url and the output is all the paper information obtained from the web page,
    including, doi, title, author, and the date about this volume """
    soup = GetUrlInfo(url)
    temp1 = soup.find_all('li',class_='entry article')
    temp2= soup.find_all('h2')
    temp2=re.sub('\\n',' ',temp2[1].text)
    # print(temp2)
    volumeYear = re.split(' ',temp2)[-1]
    paper = []
    for i in temp1:
        if i.find('div',class_='head').find('a')== None:
            paperDoi = ''
        else:
            paperDoi = i.find('div',class_='head').find('a')['href']
        title = i.find('cite').find('span',class_='title').text[:-2]
        paper.append([paperDoi,title])
    return paper,volumeYear


# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)

# put the url into the query
def Write(query,value,num):
    for count in range(num):
        query.put(value[count][0],True)
        # time.sleep(random.random())
    print('write end')

# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
    while True:
        count = COUNT.get(True)
        # print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
        COUNT.put(count, True)
        if not query.empty():
            value = query.get(True)
            count = COUNT.get(True)
            count = count + 1
            COUNT.put(count,True)
            paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
            print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
            numb = paperNumber.get(True)
            numb = numb + len(paper)
            paperNumber.put(numb) # just commented
            # print(paper,thisYear)
            PaperInfo1.put((paper,thisYear),) # just commented
            print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
        if not COUNT.empty():
            count = COUNT.get(True)
            # print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
            COUNT.put(count,True)
            if int(count) == int(num):
                print("the process "+str(i)+" end ")
                break
    print('read end')

# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
    for i in range(paperNumber.get(True)):
            value = PaperInfo1.get(True)
            print(value)

if __name__=='__main__':

    r_num = 10 # th read process number
    w_num = 1 # th write process number
    w_cnt = UrlLen # the write counter
    q = Queue(UrlLen) # the volune url queue
    paperNumber = Queue(1) # the all paper number
    COUNT = Queue(1) # the end tag
    COUNT.put(int(0)) # first is zero
    paperNumber.put(int(0)) # first is zero
    PaperInfo1 = Queue()
    r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
    w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]

    time_start = time.time()
    [task.start() for task in w_list]
    [task.start() for task in r_list]

    [task.join() for task in w_list]
    [task.join() for task in r_list]

    time_used = time.time() - time_start
    GetPaperInfo(PaperInfo1, paperNumber)
    print('time_used:{}s'.format(time_used))


I have no idea, with debug the process finally enter the process.py -> row:297: try: self.run() and then enter the row:300: util._exit_function() and just a connected the debug but I dont know why the.network can cause this error and how to solve this that's all Thank you!

Hi,this is me again,I tried a concurrent implementation of threads ,and global variables for threads are much more comfortable than process queue data sharing. By thread it does implement but my main function can't be stopped , previously with processes it was not possible to proceed to the next step when fetching concurrently, the fetching of data was implemented through threads and continued in the main function but the main function can't be stopped anymore . How interesting!

I have designed three functions similar to the previous ones.

GetUrlintoQueue is to write the fetched url UrlALL to the queue UrlQueue , UrlLen is the number of the url.

import threading
import queue

count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers

def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
    for index in range(UrlLen):
        UrlQueue.put(UrlALL[index][0], True)
    print('Write End')
    UrlQueue.task_done()

The other is GetPaperInfofromUrl . Get the url from the UrlQueue and write the information of the corresponding page to PaperInfo , index is the thread number.

def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
    global count,paperNumber
    while True:
        if not UrlQueue.empty():
            url = UrlQueue.get(True)
            count = count + 1
            paper, thisYear = GetPaperBaseInfoFromUrlAll(url)  # just commented
            print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
            print(paper,thisYear)
            paperNumber = paperNumber + len(paper)
            PaperInfo.put((paper, thisYear), True)
        if count == UrlLen:
            print("the process " + str(index) + " end ")
            break
    UrlQueue.task_done()
    PaperInfo.task_done()
    print('the process ' + str(index) +' get paper info end')

GetPaperInfo is to show the results about PaperInfo , and it don't change.

def GetPaperInfo(PaperInfo,paperNumber):
    for i in range(paperNumber):
            value = PaperInfo.get(True)
            print(value)

The main function first sets the corresponding variables, then writes directly first, then 10 threads crawl paper information, and finally shows the results, but after displaying the results still can not exit , I can not understand why.

if __name__ == '__main__':
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    UrlALL = GetVolumeUrlfromUrl(url)
    UrlLen = len(UrlALL)
    UrlQueue = queue.Queue(UrlLen)
    PaperInfo = queue.Queue(1000)
    WriteThread = 1
    ReadThread = 10

    # url write
    GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
    time_start = time.time()
    [geturl.start() for geturl in GetUrlThread]
    [geturl.join() for geturl in GetUrlThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # url write end

    # paperinfo get
    PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
    time_start = time.time()
    [getpaper.start() for getpaper in PaperinfoGetThread]
    [getpaper.join() for getpaper in PaperinfoGetThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # paperinfo get end
    
    GetPaperInfo(PaperInfo,paperNumber) # show the results
    import sys # it does not work 
    sys.exit()

The debug shows: debug.gif (I dont have 10 reputation so the picture is the type of link. )

Here is how your process might look using concurrent.futures to manage all the threads and data transport. (not tested) Adapting an example in the documentation .

from concurrent.futures import ThreadPoolExecutor 

def GetPaperInfofromUrl(index,url):
    paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
    return (index,url,paper,thisYear)

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls,descr = zip(*GetVolumeUrlfromUrl(url))
    results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futs = [executor.submit(GetPaperInfofromUrl, index,url) for index,url in enumerate(urls)]
        for future in concurrent.futures.as_completed(futs):
            results.append(future.result())

GetPaperInfofromUrl seems superfluous, you could probably refactor GetPaperBaseInfoFromUrlAll and avoid a function call.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM