[英]Python long time script running huge RAM and swap memory consumption
[英]Optimization of python multithreading script - huge memory consumption
我有一个超过800行代码的脚本(Django Management-Command)。 这应该从外部Web服务导入数据,进行操作。 并将其写入Postgres DB。
我使用多线程,因为从Web服务获取数据不是很快。
有一个线程用于使用bulk命令获取数据以获取64个数据集的大量数据,然后将每个数据集写入队列。
在开始的同时,有一个工作线程将数据操纵并将其写入数据库。 在主(句柄)类中,有一个while循环,每隔5秒查找队列中元素的数量和正在运行的工作线程的数量。 如果队列中有500个以上的元素,而工作线程少于5个,它将启动一个新的工作线程。
所有工作线程从队列中获取一项,进行其他操作,将数据集写入DB,并将一个String(最多14个字符)附加到另一个队列(#2)。
队列#2可能需要在导入结束时将所有导入的对象标记为新队列,并分别从数据库中删除当前未导入的所有其他项目。
对于数量不超过200.000数据集的DB,一切正常。 但是,例如,如果有一个具有1.000.000数据集的DB,则在处理漏洞脚本期间,内存消耗会增加,直到8 GB RAM。
有没有一种方法可以监视线程和/或队列的内存消耗? 是否有一种方法可以在每个while循环之后“清除”内存?
# -*- coding: utf-8 -*-
import os
import threading
import Queue
import time
from optparse import OptionParser, make_option
from decimal import Decimal
from datetime import datetime
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.conf import settings
def is_someone_alive(thread_list):
so_alive = False
for t in thread_list:
if t.is_alive():
so_alive = True
return so_alive
class insert_item(threading.Thread):
VarLock2 = threading.Lock()
def __init__(self, queue1, item_still_exist2, name, *args, **options):
threading.Thread.__init__(self)
self.options = options
self.name = name
self.queue1 = queue1
self.item_still_exist2 = item_still_exist2
def run(self):
while not self.queue1.empty() or getItemBulkThread.isrunning:
item = self.queue1.get()
artikelobj, created = Artikel.objects.get_or_create(artikelnr=item['Nr'])
"""
manipulate data
"""
self.item_still_exist2.put(artikelobj.artikelnr)
artikelobj.save()
self.queue1.task_done()
class getItemBulkThread(threading.Thread):
isrunning = True
VarLock = threading.Lock()
def __init__(self, queue1, name, *args, **options):
threading.Thread.__init__(self)
self.options = options
if self.options['nrStart'] != '':
self.nrab = self.options['nrStart']
else:
self.nrab = ''
self.name = name
#self.nrab = '701307'
self.queue1 = queue1
self.anz_artikel = 64
self.max_artikel = 64
self.skipped = 0
self.max_skip = 20
def run(self):
count_sleep = 0
while True:
while self.queue1.qsize() > 5000:
time.sleep(5)
count_sleep += 1
if count_sleep > 0:
print "~ Artikel-Import %(csleep)sx für 5s pausiert, da Queue-Size > 5000" % {'csleep': count_sleep}
count_sleep = 0
try:
items = getItemBulk() # from external service
except Exception as exc1:
if ('"normal" abort-condition' in str(exc1)):
getItemBulkThread.VarLock.acquire()
getItemBulkThread.isrunning = False
getItemBulkThread.VarLock.release()
break
elif self.anz_artikel > 1:
self.anz_artikel /= 2
continue
elif self.skipped <= self.max_skip:
self.nrab += 1
self.skipped += 1
time.sleep(5)
continue
elif self.skipped > self.max_skip:
raise Exception("[EXCEPTION] Fehler im Thread: too much items skipped")
else:
getItemBulkThread.VarLock.acquire()
getItemBulkThread.isrunning = False
getItemBulkThread.VarLock.release()
raise
last_item = len(items) - 1
self.nrab = items[last_item]['Nr']
for artikel in items:
artikel['katItem'] = False
self.queue1.put(artikel)
if self.anz_artikel < self.max_artikel:
self.anz_artikel *= 2
self.skipped = 0
class Command(BaseCommand):
"""
Django-mgm-command
"""
help = u'Import'
def create_parser(self, prog_name, subcommand):
"""
Create and return the ``OptionParser`` which will be used to
parse the arguments to this command.
"""
return OptionParser(prog=prog_name, usage=self.usage(subcommand),
version=self.get_version(),
option_list=self.option_list,
conflict_handler="resolve")
def handle(self, *args, **options):
startzeit = datetime.now()
anzahl_Artikel_vorher = Artikel.objects.all().count() # Artikel is a model
self.options = options
items_vorher = []
queue1 = Queue.Queue()
item_still_exists2 = Queue.Queue()
running_threads = []
thread = getItemBulkThread(queue1, name="Artikel", *args, **options)
running_threads.append(thread)
thread.daemon = True
thread.start()
anz_worker_threads = 1
anz_max_worker_threads = 5
insert_threads = [insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': i + 1}, *args, **options) for i in range(anz_worker_threads)]
for thread in insert_threads:
running_threads.append(thread)
thread.setDaemon(True)
thread.start()
add_seconds = 5
element_grenze = 500
lastelemente = 0
asc_elemente = 0
anz_abgearbeitet = 0
while getItemBulkThread.isrunning or not queue1.empty():
time.sleep(add_seconds)
elemente = queue1.qsize()
akt_zeit = datetime.now()
diff_zeit = akt_zeit - startzeit
diff = elemente - lastelemente
anz_abgearbeitet = item_still_exists2.qsize()
art_speed = (anz_abgearbeitet / timedelta_total_seconds(diff_zeit)) * 60
ersetz_var = {'anz': elemente, 'zeit': diff_zeit, 'tstamp': akt_zeit.strftime('%Y.%m.%d-%H:%M:%S'), 'anzw': anz_worker_threads, 'diff': diff, 'anza': anz_abgearbeitet, 'art_speed': art_speed}
print("%(zeit)s vergangen - %(tstamp)s - %(anz)s Elemente in Queue, Veränderung: %(diff)s - Anz Worker: %(anzw)s - Artikel importiert: %(anza)s - Speed: %(art_speed)02d Art/Min" % ersetz_var)
if diff > 0:
asc_elemente += 1
else:
asc_elemente = 0
if asc_elemente > 2 and anz_worker_threads < anz_max_worker_threads and elemente > element_grenze:
ersetz_var = {'maxw': anz_max_worker_threads, 'nr': anz_worker_threads + 1, 'element_grenze': element_grenze}
print "~~ 2x in Folge mehr Queue-Elemente als vorher, die max. Anzahl an Workern %(maxw)s noch nicht erreicht und mehr als %(element_grenze)s Elemente in der Queue, daher Start eines neuen Workers (Nr %(nr)s)" % ersetz_var
anz_worker_threads += 1
thread = insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': anz_worker_threads}, *args, **options)
running_threads.append(thread)
thread.setDaemon(True)
thread.start()
asc_elemente = 0
lastelemente = elemente
queue1.join()
items_nachher = []
while not item_still_exists2.empty():
item = item_still_exists2.get()
if item in items_vorher:
items_nachher.append(item)
items_vorher.remove(item)
item_still_exists2.task_done()
item_still_exists2.join()
if len(items_vorher) > 0:
Artikel.objects.filter(artikelnr__in=items_vorher).delete()
anzahl_Artikel_nachher = Artikel.objects.all().count()
anzahl_Artikel_diff = anzahl_Artikel_nachher - anzahl_Artikel_vorher
endzeit = datetime.now()
dauer = endzeit - startzeit
我在某些位置缩写了代码:)
内存消耗过多的可能原因是您没有为输入队列设置最大大小。 请参阅maxsize
参数 。
在相关说明中,您写道:
在主(句柄)类中,有一个while循环,每隔5秒查找队列中元素的数量和正在运行的工作线程的数量。 如果队列中有500个以上的元素,而工作线程少于5个,它将启动一个新的工作线程。
创建新线程并不一定会增加吞吐量。 您应该进行一些测试以确定最佳线程数,该结果可能为1。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.