
Python: memory problems in a script

I have written a script that reads about 4 million points and 800,000 plots. The script clips the points that fall inside each plot and saves a new text file for each plot.

After a certain period of time my PC's memory fills up. I have tried to dig into my script, but on each pass of the loop for i in xrange(len(sr)): every object is replaced and the clipped points are saved to a new txt file.

Are there any strategies I can use here to improve memory usage without reducing performance (the script is already slow)? I am a beginner in Python, so I apologize if the question is simple.

Thanks in advance, Gianni

inFile ="C://04-las_clip_inside_area//prova//Ku_115_class_Notground_normalize.las"
poly ="C://04-las_clip_inside_area//prova//ku_115_plot_clip.shp"
chunkSize = None
MinPoints = 1

sf = shapefile.Reader(poly) #open shpfile
sr = sf.shapeRecords()
poly_filename, ext = path.splitext(poly)
inFile_filename = os.path.splitext(os.path.basename(inFile))[0]
pbar = ProgressBar(len(sr)) # set progressbar
if chunkSize is None:
    points = [(p.x,p.y) for p in lasfile.File(inFile,None,'r')]
    for i in xrange(len(sr)):
        pbar.update(i+1) # progressbar
        verts = np.array(sr[i].shape.points,float)
        record = sr[i].record[0]
        index = nonzero(points_inside_poly(points, verts))[0]
        if len(index) >= MinPoints:
            file_out = open("{0}_{1}_{2}.txt".format(poly_filename, inFile_filename, record), "w")
            inside_points = [lasfile.File(inFile,None,'r')[l] for l in index]
            for p in inside_points:
                file_out.write("%s %s %s %s %s %s %s %s %s %s %s" % (p.x, p.y, p.z, p.intensity,p.return_number,p.number_of_returns,p.scan_direction,p.flightline_edge,p.classification,p.scan_angle,record)+ "\n")
            file_out.close()

This is the original function:

def LAS2TXTClipSplitbyChunk(inFile,poly,chunkSize=1,MinPoints=1):
    sf = shapefile.Reader(poly) #open shpfile
    sr = sf.shapeRecords()
    poly_filename, ext = path.splitext(poly)
    inFile_filename = os.path.splitext(os.path.basename(inFile))[0]
    pbar = ProgressBar(len(sr)) # set progressbar
    if chunkSize is None:
        points = [(p.x,p.y) for p in lasfile.File(inFile,None,'r')]
        for i in xrange(len(sr)):
            pbar.update(i+1) # progressbar
            verts = np.array(sr[i].shape.points,float)
            record = sr[i].record[0]
            index = nonzero(points_inside_poly(points, verts))[0]
            if len(index) >= MinPoints:
                file_out = open("{0}_{1}_{2}.txt".format(poly_filename, inFile_filename, record), "w")
                inside_points = [lasfile.File(inFile,None,'r')[l] for l in index]
                for p in inside_points:
                    file_out.write("%s %s %s %s %s %s %s %s %s %s %s" % (p.x, p.y, p.z, p.intensity,p.return_number,p.number_of_returns,p.scan_direction,p.flightline_edge,p.classification,p.scan_angle,record)+ "\n")
                file_out.close()
    else:
        for i in xrange(len(sr)):
            pbar.update(i+1) # progressbar
            verts = np.array(sr[i].shape.points,float)
            record = sr[i].record[0]
            f = lasfile.File(inFile,None,'r')
            file_out = open("{0}_{1}_{2}.txt".format(poly_filename, inFile_filename, record), "w")
            TotPoints = 0
            while True:
                chunk = list(islice(f,chunkSize))
                if not chunk:
                    break
                points = [(p.x,p.y) for p in chunk]
                index = nonzero(points_inside_poly(points, verts))[0]
                TotPoints += len(index) #add points to count inside th plot
                chunk = [chunk[l] for l in index]
                for p in chunk:
                    file_out.write("%s %s %s %s %s %s %s %s %s %s %s" % (p.x, p.y, p.z, p.intensity,p.return_number,p.number_of_returns,p.scan_direction,p.flightline_edge,p.classification,p.scan_angle,record)+ "\n")
            if TotPoints >= MinPoints:
                file_out.close()
            else:
                file_out.close()
                os.remove("{0}_{1}_{2}.txt".format(poly_filename, inFile_filename, record))
            f.close()

The script, updated following unutbu's suggestion, is:

import shapefile
import os
import glob
from os import path
import numpy as np
from numpy import nonzero
from matplotlib.nxutils import points_inside_poly
from itertools import islice
from liblas import file as lasfile
from shapely.geometry import Polygon
from progressbar import ProgressBar
import multiprocessing as mp


inFile ="C://04-las_clip_inside_area//prova//Ku_115_class_Notground_normalize.las"
poly ="C://04-las_clip_inside_area//prova//ku_115_plot_clip.shp"
chunkSize = None
MinPoints = 1

def pointinside(record):
    verts = np.array(record.shape.points, float)
    record = record.record[0]
    index = nonzero(points_inside_poly(points, verts))[0]
    if len(index) >= MinPoints:
        outfile = "{0}_{1}_{2}.txt".format(poly_filename, inFile_filename, record)
        with open(outfile, "w") as file_out:
            inside_points = [lasfile.File(inFile, None, 'r')[l] for l in index]
            for p in inside_points:
                fields = (p.x, p.y, p.z, p.intensity, p.return_number,
                          p.number_of_returns, p.scan_direction, p.flightline_edge,
                          p.classification, p.scan_angle, record)
                file_out.write(' '.join(map(str, fields)) + "\n")

sf = shapefile.Reader(poly) #open shpfile
sr = sf.shapeRecords()
poly_filename, ext = path.splitext(poly)
inFile_filename = os.path.splitext(os.path.basename(inFile))[0]
pbar = ProgressBar(len(sr)) # set progressbar
if chunkSize is None:
    points = [(p.x,p.y) for p in lasfile.File(inFile,None,'r')]
    for i in xrange(len(sr)):
        pbar.update(i+1) # progressbar
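        # each record is clipped in its own subprocess; join() waits for it,
        # so the records are still processed one at a time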
        proc = mp.Process(target = pointinside, args = (sr[i], ))
        proc.start()
        proc.join()

The only reliable way to free memory used for a temporary computation is to run that computation in a subprocess. When the subprocess ends, the memory will be freed.
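
Here is a minimal, self-contained sketch of that idea (the work function is hypothetical; it just builds a large temporary list):

import multiprocessing as mp

def work(n):
    # build a large temporary structure; it lives only inside this process
    big = [(float(i), float(i)) for i in xrange(n)]
    return len(big)

if __name__ == '__main__':
    proc = mp.Process(target=work, args=(4000000,))
    proc.start()
    proc.join()  # once the child exits, all of its memory is returned to the OS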

If you move the code in the outer loop into a function (let's call it work), then you can run work in a subprocess using the multiprocessing module:

import sys
import os
import time
import itertools
import multiprocessing as mp
import numpy as np
import matplotlib.nxutils as nx
import liblas
import shapefile

clock = time.clock if sys.platform == 'win32' else time.time

def LAS2TXTClipSplitbyChunk(inFile, poly, chunkSize = 1, MinPoints = 1):
    sf = shapefile.Reader(poly) #open shpfile
    sr = sf.shapeRecords()
    poly_filename, ext = os.path.splitext(poly)
    for record in sr:
        inFile_filename = os.path.splitext(os.path.basename(inFile))[0]
        record_num = record.record[0]        
        out_filename = '{0}_{1}_{2}.txt'.format(
            poly_filename, inFile_filename, record_num)
        pool.apply_async(pointinside,
                         args = (record, out_filename, inFile, chunkSize, MinPoints),
                         callback = update)

def pointinside(record, out_filename, inFile, chunkSize, MinPoints):
    start = clock()
    record_num = record.record[0]   
    verts = np.array(record.shape.points, float)
    f = iter(liblas.file.File(inFile, None, 'rb'))
    result = []
    worth_writing = False
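    # read chunkSize points at a time (or the whole file at once when chunkSize
    # is None); the two-argument iter() stops when islice returns an empty list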
    for chunk in iter(lambda: list(itertools.islice(f, chunkSize)), []):
        points = [(p.x, p.y) for p in chunk]
        index = nx.points_inside_poly(points, verts)
        chunk = [p for inside, p in itertools.izip(index,chunk) if inside]
        for p in chunk:
            fields = (p.x, p.y, p.z, p.intensity, p.return_number,
                      p.number_of_returns, p.scan_direction, p.flightline_edge,
                      p.classification, p.scan_angle, record_num)
            result.append(' '.join(map(str, fields)))
        if len(result) >= bufferSize:
            # Writing to disk is slow. Doing it once for every iteration is
            # inefficient.  So instead build up bufferSize number of lines
            # before writing them all to disk.
            worth_writing = True
            with open(out_filename, 'a') as file_out:
                file_out.write('\n'.join(result)+'\n')
            result = []
    # In case there were some results (less than bufferSize lines), we
    # dump them to disk here.
    if (len(result) >= MinPoints) or worth_writing:
        with open(out_filename, 'a') as file_out:
            file_out.write('\n'.join(result)+'\n')
    f.close()                    
    end = clock()
    return end-start

def update(result):
    with open(debug_filename, 'a') as f:
        f.write('{r}\n'.format(r = result))

if __name__ == '__main__':
    workdir = 'C://04-las_clip_inside_area//prova//'
    # workdir = os.path.expanduser('~/tmp/tmp')
    os.chdir(workdir)
    inFile = 'Ku_115_class_Notground_normalize.las'
    poly = 'ku_115_plot_clip.shp'
    debug_filename = 'debug.dat'
    chunkSize = None
    MinPoints = 1
    bufferSize = max(MinPoints, 100)

    pool = mp.Pool()
    LAS2TXTClipSplitbyChunk(inFile, poly, chunkSize, MinPoints)
    pool.close()
    pool.join()

Here is a plot of the times each task is taking to complete:

In [129]: import matplotlib.pyplot as plt

In [130]: import numpy as np

In [131]: x = np.genfromtxt('debug.dat')

In [132]: plt.plot(x)
Out[132]: [<matplotlib.lines.Line2D object at 0xe309b4c>]

In [133]: plt.show()

[plot: per-task completion times]

I'm not seeing any progressive slow-down. Perhaps give this code a try.
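
A side note on the imports: matplotlib.nxutils was removed in later matplotlib releases. Assuming matplotlib 1.2 or newer, a drop-in replacement for points_inside_poly can be built on matplotlib.path.Path:

from matplotlib.path import Path

def points_inside_poly(points, verts):
    # returns a boolean mask, like the old matplotlib.nxutils function
    return Path(verts).contains_points(points)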
