How to bypass IndexError

Here is my situation: my code parses data out of HTML tables embedded in emails. The roadblock I'm running into is that some of these tables have blank rows right in the middle, as seen in the screenshot below. Those blank rows cause my code to fail with IndexError: list index out of range when it attempts to extract text from the cells.

Is it possible to tell Python: "OK, if you run into the error caused by these blank rows, just stop there, take the rows you have extracted text from so far, and execute the rest of the code on those"...?

That might sound like a crude solution to this problem, but my project only needs the data from the most recent date in the table anyway, which is always among the first few rows, and always before these blank rows.

So if it is possible to say "if you hit this error, just ignore it and proceed", then I would like to learn how to do that. If it's not, I'll have to figure out another way around this. Thanks for any and all help.

The table with the gap: [screenshot of the email table showing a blank row in the middle]
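
To make the failure concrete, here is a stripped-down reproduction (the markup is illustrative, not the real email HTML; the key assumption is that the blank rows come through as tr elements containing no td cells):

from bs4 import BeautifulSoup

html = '''<table>
<tr><td>ID1</td><td>100</td></tr>
<tr></tr>
<tr><td>ID2</td><td>200</td></tr>
</table>'''

table = BeautifulSoup(html, 'html.parser').find('table')
for row in table.find_all('tr'):
    columns = row.find_all('td')   # [] for the blank row
    print(columns[0].get_text())   # raises IndexError on the blank row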

My code:

from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import numpy as np
import os
import re
import email
import cx_Oracle

dsnStr = cx_Oracle.makedsn("sole.nefsc.noaa.gov", "1526", "sole")
con = cx_Oracle.connect(user="user", password="password", dsn=dsnStr)

def celltext(cell):
    '''    
        textlist=[]
        for br in cell.findAll('br'):
            next = br.nextSibling
            if not (next and isinstance(next,NavigableString)):
                continue
            next2 = next.nextSibling
            if next2 and isinstance(next2,Tag) and next2.name == 'br':
                text = str(next).strip()
                if text:
                    textlist.append(next)
        return (textlist)
    '''
    textlist = []
    y = cell.find('span')  # the text lines live inside the cell's <span>
    for a in y.childGenerator():  # walk the span's direct children
        if isinstance(a, NavigableString):
            textlist.append(str(a))  # keep text nodes, skip the <br> tags
    return textlist
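
# For reference (illustrative markup, not the real email HTML), celltext
# collects the text lines inside a cell's <span>:
#   sample = BeautifulSoup('<td><span>12,000<br/>5,500</span></td>',
#                          'html.parser').td
#   celltext(sample)  ->  ['12,000', '5,500']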

path = 'Z:\\blub_2'

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'lxml')  # parse the HTML as a string
        table = soup.find_all('table')[1]  # grab the second table

df_Quota = pd.DataFrame()

for row in table.find_all('tr'):    
    columns = row.find_all('td')
    if columns[0].get_text().strip() != 'ID':  # skip the header row
        Quota = celltext(columns[1])
        Weight = celltext(columns[2])
        price = celltext(columns[3])

        print(Quota)

        Nrows = max([len(Quota), len(Weight), len(price)])  # max number of rows

        IDList = [columns[0].get_text()] * Nrows
        DateList = [columns[4].get_text()] * Nrows

        if price[0].strip() == 'Package':
            price = [columns[3].get_text()] * Nrows

        if len(Quota) < len(Weight):  # if Quota has fewer items, extend with NaN
            lstnans = [np.nan] * (len(Weight) - len(Quota))
            Quota.extend(lstnans)

        if len(price) < len(Quota):  # if price has fewer items than Quota,
            val = [columns[3].get_text()] * (len(Quota) - len(price))
            price.extend(val)  # extend with whatever is in the price column

        #if len(DateList) > len(Quota): #if DateList is longer than Quota, 
            #print("it's longer than")
            #value = [columns[4].get_text()] * (len(DateList)-len(Quota))
            #DateList = value * Nrows

        if len(Quota) < len(DateList): #if Quota is less than DateList (due to gap),
            stu = [np.nan]*(len(DateList)-len(Quota))   #extend with NaN
            Quota.extend(stu)

        if len(Weight) < len(DateList):
            dru = [np.nan]*(len(DateList)-len(Weight))
            Weight.extend(dru)

        FinalDataframe = pd.DataFrame({
            'ID': IDList,
            'AvailableQuota': Quota,
            'LiveWeightPounds': Weight,
            'price': price,
            'DatePosted': DateList
        })

        df_Quota = df_Quota.append(FinalDataframe, ignore_index=True)
        #df_Quota = df_Quota.loc[df_Quota['DatePosted']=='5/20']
        df_Q = df_Quota['DatePosted'].iloc[0]
        df_Quota = df_Quota[df_Quota['DatePosted'] == df_Q]
print(df_Quota)

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            pattern = re.compile(r'Sent:.*?\b(\d{4})\b')  # first 4-digit number on a "Sent:" line
            msg_text = f.read()  # avoid shadowing the imported email module
            dates = pattern.findall(msg_text)
            if dates:
                print("Date:", ''.join(dates))

#cursor = con.cursor()
#exported_data = [tuple(x) for x in df_Quota.values]
#sql_query = ("INSERT INTO ROUGHTABLE(species, date_posted, stock_id, pounds, money, sector_name, ask)" "VALUES (:1, :2, :3, :4, :5, 'NEFS 2', '1')")
#cursor.executemany(sql_query, exported_data)
#con.commit()

#cursor.close()
#con.close()

Use a try: ... except: block:

try:
    ...  # extract data from the table
except IndexError:
    ...  # carry on with the rest of the program

continue is the keyword to use for skipping empty/problem rows. The IndexError comes from the attempt to access columns[0] on an empty columns list: the blank rows produce tr elements that contain no td cells. So just skip to the next row when the exception is raised.

for row in table.find_all('tr'):
    columns = row.find_all('td')
    try:
        if columns[0].get_text().strip() != 'ID':
            ...  # rest as above in the original code
    except IndexError:
        continue
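
Alternatively, you can guard against the empty rows explicitly instead of catching the exception; a minimal sketch of the same loop:

for row in table.find_all('tr'):
    columns = row.find_all('td')
    if not columns:  # blank rows yield a <tr> with no <td> cells
        continue     # skip them and keep the rows gathered so far
    if columns[0].get_text().strip() != 'ID':
        ...  # rest as above in the original code

The explicit check makes the intent clearer and will not silently swallow an unrelated IndexError raised later in the block.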
