简体   繁体   中英

Program Memory Leak ( Pandas Dataframe )

I think I have narrowed down my programs memory issue to my pandas dataframe. Every loop the ram usage increase by about 300-800kb. This doesn't matter short term but this program uses the stubhub API to get tickets for an eagles game, so I'd like to run it non stop until the game happens. Which is impossible as within a couple hours the process uses all my systems ram.

I made a throwaway api account on stubhub for this so no worries.

#LIBS
import requests
import base64
import json
import pandas as pd
import datetime
from time import sleep
import gc
#SETTINGS
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None
lprice = 1
row = 250
start = 0
check = 0
#USER INPUT
pro_url = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=103577414&sectionId=0'
eventid = pro_url.replace("https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=", "").replace("&sectionId=0", "")
lprice = int(input('By default enter 1, if prices are coming back incorrect, press 2: '))

#API TOKENS && REQUESTS
app_token = '77de9c22-1799-3f30-8a6e-546c4abd9afd'
consumer_key = 'fSYdVsJFHSxn1hf2Z5Ubv5KULaka'
consumer_secret = '5Deehc9tWoN2AMSwpdVMpdmLWqwa'
stubhub_username = 'ejmoncrief@gmail.com'
stubhub_password = 'st^acerfl#owt12345!'
combo = consumer_key + ':' + consumer_secret
basic_authorization_token = base64.b64encode(combo.encode('utf-8'))
headers = {
        'Content-Type':'application/x-www-form-urlencoded',
        'Authorization':'Basic '+basic_authorization_token.decode('utf-8'),}
body = {
        'grant_type':'password',
        'username':stubhub_username,
        'password':stubhub_password,
        'scope':'PRODUCTION'}
url = 'https://api.stubhub.com/login'
r = requests.post(url, headers=headers, data=body)
token_respoonse = r.json()
access_token = token_respoonse['access_token']
user_GUID = r.headers['X-StubHub-User-GUID']
inventory_url = 'https://api.stubhub.com/search/inventory/v2'
headers['Authorization'] = 'Bearer ' + access_token
headers['Accept'] = 'application/json'
headers['Accept-Encoding'] = 'application/json'
#MAKE REQUEST
def game_req():
    global row
    global start
    global check
    data = {'eventid':eventid, 'rows':row, 'start': start}
    inventory = requests.get(inventory_url, headers=headers, params=data)
    #print(inventory) #PRINT REQUEST RESPONSE
    inv = inventory.json()
    start = inv['start']
    total_listings = inv['totalListings']
    try: #SEE IF ANY DATA, IF NOT RESTART REQ
        listing_df = pd.DataFrame(inv['listing'])
    except:
        game_req()
    listing_df['amount'] = listing_df.apply(lambda x: x['currentPrice']['amount'], axis=1)

    #DROP TABLES, IF NOT EXISTS THEN PASS
    if lprice == 1:
        try:
            listing_df.drop('currentPrice', axis=1, inplace=True)
        except:
            pass
    else:
        try:
            listing_df.drop('listingPrice', axis=1, inplace=True)
        except:
            pass
    try:
        listing_df.drop('amount', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('businessGuid', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('deliveryMethodList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('deliveryTypeList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('dirtyTicketInd', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('faceValue', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('isGA', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('listingAttributeCategoryList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('listingAttributeList', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('score', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('sellerOwnInd', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('zoneId', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('ticketSplit', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('splitVector', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('splitOption', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('sellerSectionName', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('seatNumbers', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('listingId', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('sectionId', axis=1, inplace=True)
    except:
        pass
    try:
        listing_df.drop('zoneName', axis=1, inplace=True)
    except:
        pass
    #CHECK TICKETS
    d = listing_df.to_dict(orient='records') #pd df to dict
    a = listing_df.values.tolist() #dict to list of lists
    for i in a:
        with open(eventid+'.txt', 'a+') as y:
            with open(eventid+'.txt', 'r') as z:
                if str(i)+'\n' in z:
                    pass
                else:
                    y.write(str(i)+'\n')
                    head = ['Price', 'Qty', 'Row', 'Section']
                    D=dict.fromkeys(head)
                    D.update(zip(head,i))
                    D = str(D)
                    D = D.replace("{", '').replace("}", '').replace("{'amount': ", '').replace("'currency': 'USD'}, ", '').replace("'", '').replace("amount: ", '').replace(", currency: USD", '').replace(",", ' | ')
                    print(D)

    y.close()
    z.close()
    gc.collect()
    check +=1
    print('Checked Listings '+str(check)+' Times | Last Check At: '+str(datetime.datetime.now()))
    print('Total Listings: '+str(total_listings))
    sleep(10)
    while start < total_listings:
        if start >(total_listings-250):
            start += total_listings-start
        else:
            start+=250
            row = total_listings-start
            game_req()
    else:
        start = 0
        game_req()


game_req()

Recursion is not the same as a loop. When you call a function from within a function, the Python interpreter doesn't know that you're done with the variables from the original call, so it preserves them until the call stack is finally unwound.

IOW:

import os
import time
import psutil  # third-party package, you'd need to install it

def no_wasted_memory():
    while True:
        x = list(range(1000))
        print(psutil.Process(os.getpid()).memory_info().rss)
        time.sleep(1)

def lots_of_wasted_memory():
    x = list(range(1000))
    print(psutil.Process(os.getpid()).memory_info().rss)
    time.sleep(1)
    lots_of_wasted_memory()

gives me

In [101]: no_wasted_memory()
108367872
108367872
108367872
108367872
108367872

and

In [103]: lots_of_wasted_memory()
109080576
109105152
109137920
109166592
109195264
[...]

If you want something to go on indefinitely, or at least until a condition is reached, use a while loop.


Aside #1: don't use global variables unless you need to. Pass variables honestly.

Aside #2: don't use bare excepts, they hide errors.

Aside #3: don't repeat yourself. All of those drop lines could be reduced to

df = df.drop(columns_to_drop, axis=1, errors='ignore')

where columns_to_drop is, unsurprisingly, a list of the column names you want to drop.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM