[英]Run Python Script on Subdirectories
我有一個帶有子目錄的父目錄,每個子目錄都包含一個 .html 文件,我想對每個這樣的文件運行我的代碼。 該代碼讀取一個 html 文件,並導出包含其表格數據的相應 csv 文件。
我嘗試了兩種主要方法,但都不能正常工作,因為腳本找不到 .html 文件(顯示文件不存在)。 注意:子目錄中每個文件的名稱始終為 index.html
# Run the converter once inside every subdirectory that holds an index.html.
# The script opens 'index.html' relative to the working directory, so each
# run happens in a subshell that has cd'ed into the subdirectory first.
for dir in */; do
    [ -f "${dir}index.html" ] || continue
    (cd "$dir" && python ../html_csv2.py index.html)
done
# Convert index.html from the working directory into antismash_html.csv.
name = 'index.html'
html = utils.getFileContent(name)

# The same markup is parsed twice: BeautifulSoup supplies the <title>,
# SimplifiedDoc supplies the region tables.
doc = SimplifiedDoc(html)
soup = bs(html, 'lxml')

# The strain is the part of the page title in front of the first ' -'.
title = soup.select_one('title').text
title = title.split(' -')
strain = title[0]

# Gather the cell texts of every body row; the last matched table is skipped.
rows = []
tables = doc.selects('table.region-table')
tables = tables[:-1]
for table in tables:
    rows.extend([td.text for td in tr.tds] for tr in table.tbody.trs)

columns = ['Region', 'Class', 'From', 'To', 'Associated Product', 'Class', 'Similarity']
# Handing the labels to the constructor also covers the "no rows" case.
df_rows = pd.DataFrame(rows, columns=columns)
# 'Class' occurs twice, so selecting columns by label would return both
# 'Class' columns for each mention; insert() puts 'Strain' first instead.
df_rows.insert(0, 'Strain', strain)

df_rows.to_csv(r'antismash_html.csv', index=False, header=True)
print('CSV CREATED')
在第二個片段中,我嘗試使用 os 庫相應地進入每個子目錄。
import csv
from simplified_scrapy import SimplifiedDoc,req,utils
import sys
import pandas as pd
import lxml.html
from bs4 import BeautifulSoup as bs
import os
# File name that is looked up inside every visited directory.
name = 'index.html'

cwd = os.getcwd()
print(cwd)
directory_to_check = cwd  # Which directory do you want to start with?


def directory_function(directory):
    """Print *directory* followed by the names of the entries it contains."""
    print("Listing: " + directory)
    print("\t-" + "\n\t-".join(os.listdir(directory)))


# Get all the subdirectories of directory_to_check recursively and store
# them in a list; the start directory itself is left out.
directories = [os.path.abspath(x[0]) for x in os.walk(directory_to_check)]
directories.remove(os.path.abspath(directory_to_check))


def csv_create(name):
    """Export the region tables of the HTML file *name* to antismash_html.csv.

    The file is loaded and parsed on every call, so each directory is
    converted from its own page. The CSV is written to the working
    directory.
    """
    html = utils.getFileContent(name)
    doc = SimplifiedDoc(html)
    soup = bs(html, 'lxml')

    # The strain is the part of the page title in front of the first ' -'.
    strain = soup.select_one('title').text.split(' -')[0]

    # Gather the cell texts of every body row; the last matched table is
    # skipped.
    rows = []
    for table in doc.selects('table.region-table')[:-1]:
        rows.extend([td.text for td in tr.tds] for tr in table.tbody.trs)

    columns = ['Region', 'Class', 'From', 'To', 'Associated Product', 'Class', 'Similarity']
    # Handing the labels to the constructor also covers the "no rows" case.
    df_rows = pd.DataFrame(rows, columns=columns)
    # 'Class' occurs twice, so selecting columns by label would return both
    # 'Class' columns for each mention; insert() puts 'Strain' first instead.
    df_rows.insert(0, 'Strain', strain)

    df_rows.to_csv(r'antismash_html.csv', index=False, header=True)
    print('CSV CREATED')


for i in directories:
    os.chdir(i)  # Change working directory so relative paths resolve here
    if not os.path.isfile(name):
        continue  # os.walk also yields folders that hold no page
    csv_create(name)
我嘗試使用此處的示例: Python:在所有子目錄中運行腳本,但無法相應地執行。
或者,您可以考慮使用 glob.glob()。 但要注意搜索的起點:要麼在 glob 表達式中寫明要搜索的文件夾路徑,要麼先 cd 到該文件夾中。
glob 會給你一個簡單的相對路徑列表。
>>> import glob
>>>
>>> files = glob.glob('**/*.py', recursive=True)
>>> len(files)
3177
>>> files[0]
'_wxWidgets-3.0.2/build/bakefiles/wxwin.py'
>>>
glob 表達式的示例可參見官方文檔: https://docs.python.org/3.5/library/glob.html
如果您從具有許多嵌套子文件夾的文件夾開始對驅動器進行遞歸搜索,則它會使解釋器陷入困境,直到它完成 - 或者您終止會話。
嘗試這個。
import os
from simplified_scrapy import utils
def getSubDir(name, end=None):
    """Return the entries of directory *name*.

    Without *end*, the bare entry names are returned exactly as
    ``os.listdir`` reports them. With *end*, only entries whose name ends
    with that suffix are kept, and each one is joined onto *name*.
    """
    filelist = os.listdir(name)
    if end:
        # str.endswith is all lowercase; str has no endsWith method.
        filelist = [
            os.path.join(name, entry)
            for entry in filelist
            if entry.endswith(end)
        ]
    return filelist
subDir = getSubDir('./')  # The directory which you want to start with
for sub_dir in subDir:
    # Each subdirectory is expected to keep its page as index.html.
    fileName = os.path.join(sub_dir, 'index.html')
    if not os.path.isfile(fileName):
        continue  # plain files and folders without a page are skipped
    html = utils.getFileContent(fileName)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.