程序内存泄漏（Pandas Dataframe）

Question

我想我已经将程序内存问题缩小到了熊猫数据框。 每个循环中，ram使用量增加约300-800kb。 短期内这无关紧要，但是该程序使用stubhub API来获取老鹰游戏的入场券，因此我想不停地运行它直到游戏发生。 这是不可能的，因为该过程在几个小时内就使用了我所有的系统内存。

我为此在stubhub上创建了一个一次性API帐户，因此不必担心。

#LIBS
import requests
import base64
import json
import pandas as pd
import datetime
from time import sleep
import gc
#SETTINGS
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None
lprice = 1
row = 250
start = 0
check = 0
#USER INPUT
pro_url = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=103577414&sectionId=0'
eventid = pro_url.replace("https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=", "").replace("&sectionId=0", "")
lprice = int(input('By default enter 1, if prices are coming back incorrect, press 2: '))

#API TOKENS && REQUESTS
app_token = '77de9c22-1799-3f30-8a6e-546c4abd9afd'
consumer_key = 'fSYdVsJFHSxn1hf2Z5Ubv5KULaka'
consumer_secret = '5Deehc9tWoN2AMSwpdVMpdmLWqwa'
stubhub_username = 'ejmoncrief@gmail.com'
stubhub_password = 'st^acerfl#owt12345!'
combo = consumer_key + ':' + consumer_secret
basic_authorization_token = base64.b64encode(combo.encode('utf-8'))
headers = {
        'Content-Type':'application/x-www-form-urlencoded',
        'Authorization':'Basic '+basic_authorization_token.decode('utf-8'),}
body = {
        'grant_type':'password',
        'username':stubhub_username,
        'password':stubhub_password,
        'scope':'PRODUCTION'}
url = 'https://api.stubhub.com/login'
r = requests.post(url, headers=headers, data=body)
token_respoonse = r.json()
access_token = token_respoonse['access_token']
user_GUID = r.headers['X-StubHub-User-GUID']
inventory_url = 'https://api.stubhub.com/search/inventory/v2'
headers['Authorization'] = 'Bearer ' + access_token
headers['Accept'] = 'application/json'
headers['Accept-Encoding'] = 'application/json'
#MAKE REQUEST
def game_req():
    global row
    global start
    global check
    data = {'eventid':eventid, 'rows':row, 'start': start}
    inventory = requests.get(inventory_url, headers=headers, params=data)
    #print(inventory) #PRINT REQUEST RESPONSE
    inv = inventory.json()
    start = inv['start']
    total_listings = inv['totalListings']
    try: #SEE IF ANY DATA, IF NOT RESTART REQ
        listing_df = pd.DataFrame(inv['listing'])
    except:
        game_req()
    listing_df['amount'] = listing_df.apply(lambda x: x['currentPrice']['amount'], axis=1)

    #DROP TABLES, IF NOT EXISTS THEN PASS
    if lprice == 1:
        try:
            listing_df.drop('currentPrice', axis=1, inplace=True)
        except:
            pass
    else:
        try:
            listing_df.drop('listingPrice', axis=1, inplace=True)
        except:
            pass
    try:
        listing_df.drop('amount', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('businessGuid', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('deliveryMethodList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('deliveryTypeList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('dirtyTicketInd', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('faceValue', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('isGA', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('listingAttributeCategoryList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('listingAttributeList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('score', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('sellerOwnInd', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('zoneId', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('ticketSplit', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('splitVector', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('splitOption', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('sellerSectionName', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('seatNumbers', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('listingId', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('sectionId', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('zoneName', axis=1, inplace=True)
    except:
        pass
    #CHECK TICKETS
    d = listing_df.to_dict(orient='records') #pd df to dict
    a = listing_df.values.tolist() #dict to list of lists
    for i in a:
        with open(eventid+'.txt', 'a+') as y:
            with open(eventid+'.txt', 'r') as z:
                if str(i)+'\n' in z:
                    pass
                else:
                    y.write(str(i)+'\n')
                    head = ['Price', 'Qty', 'Row', 'Section']
                    D=dict.fromkeys(head)
                    D.update(zip(head,i))
                    D = str(D)
                    D = D.replace("{", '').replace("}", '').replace("{'amount': ", '').replace("'currency': 'USD'}, ", '').replace("'", '').replace("amount: ", '').replace(", currency: USD", '').replace(",", ' | ')
                    print(D)

    y.close()
    z.close()
    gc.collect()
    check +=1
    print('Checked Listings '+str(check)+' Times | Last Check At: '+str(datetime.datetime.now()))
    print('Total Listings: '+str(total_listings))
    sleep(10)
    while start < total_listings:
        if start >(total_listings-250):
            start += total_listings-start
        else:
            start+=250
            row = total_listings-start
            game_req()
    else:
        start = 0
        game_req()


game_req()

Answer 1

递归与循环不同。 当您从函数内部调用函数时，Python解释器不知道您已经完成了原始调用中的变量，因此它将保留这些变量，直到最终取消调用栈。

IOW：

import os
import time
import psutil  # third-party package, you'd need to install it

def no_wasted_memory():
    while True:
        x = list(range(1000))
        print(psutil.Process(os.getpid()).memory_info().rss)
        time.sleep(1)

def lots_of_wasted_memory():
    x = list(range(1000))
    print(psutil.Process(os.getpid()).memory_info().rss)
    time.sleep(1)
    lots_of_wasted_memory()

给我

In [101]: no_wasted_memory()
108367872
108367872
108367872
108367872
108367872

和

In [103]: lots_of_wasted_memory()
109080576
109105152
109137920
109166592
109195264
[...]

如果您想无限期地进行操作，或者至少要等到达到条件时，请使用while循环。

除了＃1：除非需要，否则不要使用全局变量。 诚实地传递变量。

除了＃2：不要使用裸露的例外，它们会隐藏错误。

除了＃3：不要重复自己。 所有这些下降线可以减少到

df = df.drop(columns_to_drop, axis=1, errors='ignore')

毫不奇怪，这里columns_to_drop是要删除的列名的列表。

程序内存泄漏（Pandas Dataframe）

问题描述

1 个解决方案

解决方案1
2 2018-09-03 22:23:31

程序内存泄漏（Pandas Dataframe）

问题描述

1 个解决方案

解决方案1 2 2018-09-03 22:23:31

解决方案1
2 2018-09-03 22:23:31