[英]Python multiprocessing.Process can not stop when after connecting the network
多线程爬取论文信息时,获取信息后无法关闭进程:
error当我注释掉function是从.network获取信息的代码时,这些进程可以正常结束。 normal这个错误很困扰我,我不知道,我的.network connect 是通过requests
和set response.close()
有没有帅哥或者美女帮帮这个迷茫的人? 谢谢
这是完整的代码:我的 python 是 python 3.7
from multiprocessing import Process, Queue, Pool,Manager,Value
import time, random
import requests
import re
from bs4 import BeautifulSoup
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
'Connection': 'close'
}
## Just get the html text
def GetUrlInfo(url):
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
response.close()
SoupData = BeautifulSoup(response.text, 'lxml')
return SoupData
def GetVolumeUrlfromUrl(url:str)->str:
"""input is Journal's url and output is a link and a text description to each issue of the journal"""
url = re.sub('http:', 'https:', url)
SoupDataTemp = GetUrlInfo(url+'index.html')
SoupData = SoupDataTemp.find_all('li')
UrlALL = []
for i in SoupData:
if i.find('a') != None:
volumeUrlRule = '<a href=\"(.*?)\">(.*?)</a>'
volumeUrlTemp = re.findall(volumeUrlRule,str(i),re.I)
# u = i.find('a')['href']
# # print(u)
for u in volumeUrlTemp:
if re.findall(url, u[0]):
# print(u)
UrlALL.append((u[0], u[1]), )
# print(UrlALL)
return UrlALL
def GetPaperBaseInfoFromUrlAll(url:str)->str:
"""The input is the url and the output is all the paper information obtained from the web page,
including, doi, title, author, and the date about this volume """
soup = GetUrlInfo(url)
temp1 = soup.find_all('li',class_='entry article')
temp2= soup.find_all('h2')
temp2=re.sub('\\n',' ',temp2[1].text)
# print(temp2)
volumeYear = re.split(' ',temp2)[-1]
paper = []
for i in temp1:
if i.find('div',class_='head').find('a')== None:
paperDoi = ''
else:
paperDoi = i.find('div',class_='head').find('a')['href']
title = i.find('cite').find('span',class_='title').text[:-2]
paper.append([paperDoi,title])
return paper,volumeYear
# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
# put the url into the query
def Write(query,value,num):
for count in range(num):
query.put(value[count][0],True)
# time.sleep(random.random())
print('write end')
# from the query get the url and get the paper info with this url
def Read(query,num,PaperInfo1,COUNT,i,paperNumber):
while True:
count = COUNT.get(True)
# print("before enter" + str(i) + ' - ' + str(count)+' - '+str(num))
COUNT.put(count, True)
if not query.empty():
value = query.get(True)
count = COUNT.get(True)
count = count + 1
COUNT.put(count,True)
paper, thisYear = GetPaperBaseInfoFromUrlAll(value) # just commented
print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
numb = paperNumber.get(True)
numb = numb + len(paper)
paperNumber.put(numb) # just commented
# print(paper,thisYear)
PaperInfo1.put((paper,thisYear),) # just commented
print("the process "+str(i)+' - '+ str(count)+ ' : '+value)
if not COUNT.empty():
count = COUNT.get(True)
# print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
COUNT.put(count,True)
if int(count) == int(num):
print("the process "+str(i)+" end ")
break
print('read end')
# print the paper info
def GetPaperInfo(PaperInfo1,paperNumber):
for i in range(paperNumber.get(True)):
value = PaperInfo1.get(True)
print(value)
if __name__=='__main__':
r_num = 10 # th read process number
w_num = 1 # th write process number
w_cnt = UrlLen # the write counter
q = Queue(UrlLen) # the volune url queue
paperNumber = Queue(1) # the all paper number
COUNT = Queue(1) # the end tag
COUNT.put(int(0)) # first is zero
paperNumber.put(int(0)) # first is zero
PaperInfo1 = Queue()
r_list = [Process( target=Read, args=(q,w_cnt,PaperInfo1,COUNT,i,paperNumber) ) for i in range(r_num)]
w_list = [Process( target=Write, args=(q,UrlALL,w_cnt) )]
time_start = time.time()
[task.start() for task in w_list]
[task.start() for task in r_list]
[task.join() for task in w_list]
[task.join() for task in r_list]
time_used = time.time() - time_start
GetPaperInfo(PaperInfo1, paperNumber)
print('time_used:{}s'.format(time_used))
我不知道,调试过程最终进入process.py
-> row:297: try: self.run()
然后进入 row:300: util._exit_function()
并连接调试但我不知道为什么.network 会导致这个错误以及如何解决这个问题就这些了谢谢!
嗨,又是我,我尝试了线程的并发实现,线程的global
变量比进程队列数据共享舒服多了。 通过线程它确实实现了但我的主 function 无法停止,以前使用进程在并发获取时无法继续下一步fetching of data was implemented through threads and continued in the main function
但the main function can't be stopped anymore
。 多么有趣!
我设计了三个与前面类似的功能。
GetUrlintoQueue
是将抓取到的url 的UrlALL
到队列UrlQueue
中, UrlLen
是url 的编号。
import threading
import queue
count = 0 # Record the number of times a value is fetched from the queue
paperNumber = 0 # Record the number of papers
def GetUrlintoQueue(UrlQueue,UrlALL,UrlLen):
for index in range(UrlLen):
UrlQueue.put(UrlALL[index][0], True)
print('Write End')
UrlQueue.task_done()
另一个是GetPaperInfofromUrl
。 从 UrlQueue 中获取UrlQueue
并将对应页面的信息写入PaperInfo
, index
为线程号。
def GetPaperInfofromUrl(UrlQueue,PaperInfo,index,UrlLen):
global count,paperNumber
while True:
if not UrlQueue.empty():
url = UrlQueue.get(True)
count = count + 1
paper, thisYear = GetPaperBaseInfoFromUrlAll(url) # just commented
print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
print(paper,thisYear)
paperNumber = paperNumber + len(paper)
PaperInfo.put((paper, thisYear), True)
if count == UrlLen:
print("the process " + str(index) + " end ")
break
UrlQueue.task_done()
PaperInfo.task_done()
print('the process ' + str(index) +' get paper info end')
GetPaperInfo
是显示PaperInfo
的结果,它不会改变。
def GetPaperInfo(PaperInfo,paperNumber):
for i in range(paperNumber):
value = PaperInfo.get(True)
print(value)
main function先设置好相应的变量,然后直接先写入,然后10个线程爬取论文信息,最后显示结果,但是显示结果后还是can not exit
,我也想不明白为什么。
if __name__ == '__main__':
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)
UrlQueue = queue.Queue(UrlLen)
PaperInfo = queue.Queue(1000)
WriteThread = 1
ReadThread = 10
# url write
GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue,UrlALL,UrlLen,))]
time_start = time.time()
[geturl.start() for geturl in GetUrlThread]
[geturl.join() for geturl in GetUrlThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# url write end
# paperinfo get
PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue,PaperInfo,index,UrlLen,)) for index in range(ReadThread)]
time_start = time.time()
[getpaper.start() for getpaper in PaperinfoGetThread]
[getpaper.join() for getpaper in PaperinfoGetThread]
time_used = time.time() - time_start
print('time_used:{}s'.format(time_used))
# paperinfo get end
GetPaperInfo(PaperInfo,paperNumber) # show the results
import sys # it does not work
sys.exit()
调试显示: debug.gif (我没有 10 个信誉所以图片是链接类型。)
以下是您的进程使用 concurrent.futures 来管理所有线程和数据传输的样子。 (未测试)改编文档中的示例。
from concurrent.futures import ThreadPoolExecutor
def GetPaperInfofromUrl(index,url):
paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
return (index,url,paper,thisYear)
if __name__ == "__main__":
url = 'http://dblp.uni-trier.de/db/journals/talg/'
urls,descr = zip(*GetVolumeUrlfromUrl(url))
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
futs = [executor.submit(GetPaperInfofromUrl, index,url) for index,url in enumerate(urls)]
for future in concurrent.futures.as_completed(futs):
results.append(future.result())
GetPaperInfofromUrl
似乎是多余的,您可以重构GetPaperBaseInfoFromUrlAll
并避免调用 function。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.