[英]Python sys.argv[1] index out of range
我有以下代碼從一堆PDF文件中提取一些數據。 它適用於幾個文件,然后給了我索引超出范圍的錯誤。
__author__ = 'thavan'
import os
import sys
import convertor
def _get_file_list(root):
"""
Get all the pdf files in a given directory.
:param root:
:return:
"""
file_list = []
for root, dir, files in os.walk(root):
if len(files) > 0:
for f in files:
file_path = os.path.join(root, f)
if file_path.endswith('.pdf'):
file_list.append(file_path)
return file_list
def _match_key(key, match_list):
for match in match_list:
if match == key:
return True
return False
class SedaScraper(object):
process_only = [] # Add here any PDF file with full path if you want to process only these files.
def __init__(self):
"""
Update this list whenever need to fetch a new value.
:return:
"""
self.total_spend_key = ['Total Spend', 'Total spend']
self.total_spend_all_media_key = ['Total spend All Media']
self.outlet_per_all_media_key = ['Press % All Media', 'Internet % All Media', 'Outdoor % All Media',
'TV % All Media', 'Cinema % All Media']
self.no_of_new_create_key = ['No of New Banners', 'No. of New Creatives']
def _get_csv_values(self, pdf_file):
"""Extracts values from a given PDF file.
"""
pdf_text = convertor.convert(pdf_file).get_text()
lines = pdf_text.split('\n')
data = []
sub_data = []
for line in lines:
if line.strip() != '':
sub_data.append(line.strip())
else:
data.append(sub_data)
sub_data = []
outlet = data[0][0]
company_name = data[1][0]
date = data[2][0]
start_date = date.split(' to ')[0]
end_date = date.split(' to ')[1]
for x in range(3, len(data)):
try:
if _match_key(data[x][0], self.no_of_new_create_key):
metric_data = dict(zip(data[x], data[x+1]))
break
except IndexError:
print "Some required text not found. Please check following data... {}".format(data)
sys.exit(1)
total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives = self._parse_metric(metric_data)
print company_name, outlet, start_date, end_date, total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives
# change below CSV separator as required.
return '|'.join((company_name, outlet, start_date, end_date, total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives))
def _parse_metric(self, metric_data):
total_spend = None
total_spend_all_media = None
outlet_per_all_media = None
no_of_new_creatives = None
for key, value in metric_data.items():
if _match_key(key, self.total_spend_key):
total_spend = value
elif _match_key(key, self.total_spend_all_media_key):
total_spend_all_media = value
elif _match_key(key, self.outlet_per_all_media_key):
outlet_per_all_media = value
elif _match_key(key, self.no_of_new_create_key):
no_of_new_creatives = value
return total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives
def process(self, root):
"""Iteratively goes through every PDF file.
:param root:
:return:
"""
pdf_list = _get_file_list(root)
out_file = open(os.path.join(root, 'output.csv'), 'w')
if self.process_only:
pdf_list = self.process_only
for pdf in pdf_list:
print "Processing", pdf
csv_line = self._get_csv_values(pdf)
out_file.write(csv_line + '\n')
print "Output file: {}".format(out_file.name)
if __name__ == '__main__':
if len(sys.argv) == 1:
print "Usage: pdf_scraper.py <path>"
sys.exit(1)
SedaScraper().process(sys.argv[1])
這是我得到的錯誤:
C:\Users\soz\Documents\Python\seda_pdf\src>python pdf_scraper.py C:\Users\soz\Do
cuments\Python\seda_pdf\2
Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\01.pdf
Aberdeen Asset Management Internet 01 January 2009 31 January 2009 £5,505 £166
,384 3.31% 5
Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\02.pdf
Aberdeen Asset Management Internet 01 February 2009 28 February 2009 £5,906 £2
26,575 2.61% 5
Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\03.pdf
Traceback (most recent call last):
File "pdf_scraper.py", line 117, in <module>
SedaScraper().process(sys.argv[1])
File "pdf_scraper.py", line 109, in process
csv_line = self._get_csv_values(pdf)
File "pdf_scraper.py", line 65, in _get_csv_values
end_date = date.split(' to ')[1]
IndexError: list index out of range
我不知道出什么問題了,因為它實際上適用於我擁有的某些PDF文件。 我也檢查了我擁有的文件,它們沒有問題。
我的Python知識很有限,因此,如果您的回答可以證明是假的,我將不勝感激。
end_date = date.split(' to ')[1]
IndexError: list index out of range
如果沒有to
目前的數據變量。 因此不會發生拆分,而是如何獲取拆分零件的索引1。
因此將以上內容更改為
end_date = ""
if ' to ' in date:
end_date = date.split(' to ')[1]
嘗試:
if not ' to ' in date:
// raise exception - validate if string contains your condition
date_chunks = date.split(' to ')
start_date = date_chunks [0]
end_date = date_chunks [1]
在Python中,就像在Java中一樣,我們從零開始索引,而不是從1開始
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.