When I try to crawl thesis information in multiple threads, I cannot close the process after getting the information:
error And when I comment the code which function is get the information from.network, these processes can end normally. normal This error is trouble me and I don't have any idea, my.network connect is by requests
and set the response.close()
so can any handsome brother or beautiful lady help this confused person? Thanks
This is whole code: my python is python 3.7
from multiprocessing import Process, Queue, Pool,Manager,Value
import time, random
import requests
import re
from bs4 import BeautifulSoup
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
'Connection': 'close'
}
## Just get the html text
def GetUrlInfo(url):
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
response.close()
SoupData = BeautifulSoup(response.text, 'lxml')
return SoupData
def GetVolumeUrlfromUrl(url:str)->str:
"""input is Journal's url and output is a link and a text description to each issue of the journal"""
url = re.sub('http:', 'https:', url)
SoupDataTemp = GetUrlInfo(url+'index.html')
SoupData = SoupDataTemp.find_all('li')
UrlALL = []
for i in SoupData:
if i.find('a') != None:
volumeUrlRule = '<a href=\"(.*?)\">(.*?)</a>'
volumeUrlTemp = re.findall(volumeUrlRule,str(i),re.I)
# u = i.find('a')['href']
# # print(u)
for u in volumeUrlTemp:
if re.findall(url, u[0]):
# print(u)
UrlALL.append((u[0], u[1]), )
# print(UrlALL)
return UrlALL
def GetPaperBaseInfoFromUrlAll(url:str)->str:
"""The input is the url and the output is all the paper information obtained from the web page,
including, doi, title, author, and the date about this volume """
soup = GetUrlInfo(url)
temp1 = soup.find_all('li',class_='entry article')
temp2= soup.find_all('h2')
temp2=re.sub('\\n',' ',temp2[1].text)
# print(temp2)
volumeYear = re.split(' ',temp2)[-1]
paper = []
for i in temp1:
if i.find('div',class_='head').find('a')== None:
paperDoi = ''
else:
paperDoi = i.find('div',class_='head').find('a')['href']
title = i.find('cite').find('span',class_='title').text[:-2]
paper.append([paperDoi,title])
return paper,volumeYear
# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
# put the url into the query
def Write(query,value,num):
for count in range(num):
query.put(value[count][0],True)
# time.sleep(random.random())
print('write end')
# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
while True:
count = COUNT.get(True)
# print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
COUNT.put(count, True)
if not query.empty():
value = query.get(True)
count = COUNT.get(True)
count = count + 1
COUNT.put(count,True)
paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
numb = paperNumber.get(True)
numb = numb + len(paper)
paperNumber.put(numb) # just commented
# print(paper,thisYear)
PaperInfo1.put((paper,thisYear),) # just commented
print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
if not COUNT.empty():
count = COUNT.get(True)
# print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
COUNT.put(count,True)
if int(count) == int(num):
print("the process "+str(i)+" end ")
break
print('read end')
# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
for i in range(paperNumber.get(True)):
value = PaperInfo1.get(True)
print(value)
if __name__=='__main__':
r_num = 10 # th read process number
w_num = 1 # th write process number
w_cnt = UrlLen # the write counter
q = Queue(UrlLen) # the volune url queue
paperNumber = Queue(1) # the all paper number
COUNT = Queue(1) # the end tag
COUNT.put(int(0)) # first is zero
paperNumber.put(int(0)) # first is zero
PaperInfo1 = Queue()
r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]
time_start = time.time()
[task.start() for task in w_list]
[task.start() for task in r_list]
[task.join() for task in w_list]
[task.join() for task in r_list]
time_used = time.time() - time_start
GetPaperInfo(PaperInfo1, paperNumber)
print('time_used:{}s'.format(time_used))
I have no idea, with debug the process finally enter the process.py
-> row:297: try: self.run()
and then enter the row:300: util._exit_function()
and just a connected the debug but I dont know why the.network can cause this error and how to solve this that's all Thank you!
Hi,this is me again,I tried a concurrent implementation of threads ,and global
variables for threads are much more comfortable than process queue data sharing. By thread it does implement but my main function can't be stopped , previously with processes it was not possible to proceed to the next step when fetching concurrently, the fetching of data was implemented through threads and continued in the main function
but the main function can't be stopped anymore
. How interesting!
I have designed three functions similar to the previous ones.
GetUrlintoQueue
is to write the fetched url UrlALL
to the queue UrlQueue
, UrlLen
is the number of the url.
import threading
import queue
count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers
def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
for index in range(UrlLen):
UrlQueue.put(UrlALL[index][0], True)
print('Write End')
UrlQueue.task_done()
The other is GetPaperInfofromUrl
. Get the url from the UrlQueue
and write the information of the corresponding page to PaperInfo
, index
is the thread number.
def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
global count,paperNumber
while True:
if not UrlQueue.empty():
url = UrlQueue.get(True)
count = count + 1
paper, thisYear = GetPaperBaseInfoFromUrlAll(url) # just commented
print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
print(paper,thisYear)
paperNumber = paperNumber + len(paper)
PaperInfo.put((paper, thisYear), True)
if count == UrlLen:
print("the process " + str(index) + " end ")
break
UrlQueue.task_done()
PaperInfo.task_done()
print('the process ' + str(index) +' get paper info end')
GetPaperInfo
is to show the results about PaperInfo
, and it don't change.
def GetPaperInfo(PaperInfo,paperNumber):
for i in range(paperNumber):
value = PaperInfo.get(True)
print(value)
The main function first sets the corresponding variables, then writes directly first, then 10 threads crawl paper information, and finally shows the results, but after displaying the results still can not exit
, I can not understand why.
if __name__ == '__main__':
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
UrlQueue = queue.Queue(UrlLen)
PaperInfo = queue.Queue(1000)
WriteThread = 1
ReadThread = 10
# url write
GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
time_start = time.time()
[geturl.start() for geturl in GetUrlThread]
[geturl.join() for geturl in GetUrlThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# url write end
# paperinfo get
PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
time_start = time.time()
[getpaper.start() for getpaper in PaperinfoGetThread]
[getpaper.join() for getpaper in PaperinfoGetThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# paperinfo get end
GetPaperInfo(PaperInfo,paperNumber) # show the results
import sys # it does not work
sys.exit()
The debug shows: debug.gif (I dont have 10 reputation so the picture is the type of link. )
Here is how your process might look using concurrent.futures to manage all the threads and data transport. (not tested) Adapting an example in the documentation .
from concurrent.futures import ThreadPoolExecutor
def GetPaperInfofromUrl(index,url):
paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
return (index,url,paper,thisYear)
if __name__ == "__main__":
url = 'http://dblp.uni-trier.de/db/journals/talg/'
urls,descr = zip(*GetVolumeUrlfromUrl(url))
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
futs = [executor.submit(GetPaperInfofromUrl, index,url) for index,url in enumerate(urls)]
for future in concurrent.futures.as_completed(futs):
results.append(future.result())
GetPaperInfofromUrl
seems superfluous, you could probably refactor GetPaperBaseInfoFromUrlAll
and avoid a function call.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.