[英]Parsing file with ElementTree and BeautifulSoup: is there a way to parse the file by number of tag levels?
我有这个 xml文件,我基本上想将所有信息记录到字典中。
我写了这段代码:
import requests
import xml.etree.ElementTree as ET
import urllib2
import glob
import pprint
from bs4 import BeautifulSoup
#get the XML file
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'w') as input:
# input.write(response.content)
# Set up the worklist of downloaded XML files plus the tag tables that drive
# the parse below.
set_of_files = glob.glob('output*txt')


def val(x):
    """Prefix tag name *x* with the DrugBank XML namespace."""
    return "{http://www.drugbank.ca}" + str(x)


# Simple one-level tags read directly off each <drug> element.
key_list = ['drugbank-id','name','description','cas-number','unii','average-mass','monoisotopic-mass','state','indication','pharmacodynamics','mechanism-of-action','toxicity','metabolism','absorption','half-life','protein-binding','route-of-elimination','volume-of-distribution','fda-label','msds']
# Accumulator mapping tag name -> extracted values, filled by the main loop.
key_dict = {}
# Two-level container tags whose children are flattened into a plain list.
method1 = ['groups','synonyms','patent']
# (container-tag, item-tag) pairs handled by method3.
method3_inputs = [('dosages','dosage'),('salts','salt'),('products','product'),('mixtures','mixture'),('packagers','packager'),('categories','category'),('atc-codes','atc-code'),('pdb-entries','pdb-entry'),('food-interactions','food-interaction'),('drug-interactions','drug-interaction'),('properties','property'),('external-identifiers','external-identifier'),('external-links','external-link'),('reactions','reaction')]
# Child tags of <classification> that the main loop inspects.
list_to_run_thru = ['description','direct-parent','kingdom','superclass','class','subclass']
alternative_parents = []
http_add = '{http://www.drugbank.ca}'
substituents = []


def ap_sub(x):
    """Prefix tag name *x* with the DrugBank XML namespace."""
    return '{http://www.drugbank.ca}' + x
def method2(list2_name, list3_name, list4_name, node=None):
    """Collect the text of every third-level tag under *node*.

    Scans the children of ``node`` for elements tagged *list2_name*, their
    children for *list3_name*, and returns the ``.text`` of every
    *list4_name* grandchild, in document order.

    ``node`` defaults to the module-global ``subnode`` (the element the main
    loop is currently visiting), so existing three-argument calls keep
    working; passing ``node`` explicitly removes the hidden-global coupling
    and makes the function testable on its own.
    """
    if node is None:
        node = subnode  # fall back to the main loop's current element
    return [u.text
            for i in node if i.tag == list2_name
            for a in i if a.tag == list3_name
            for u in a if u.tag == list4_name]
def method3(list1_name, node=None):
    """Return one list of child ``.text`` values per *list1_name* element.

    For every child of ``node`` tagged *list1_name*, collects the ``.text``
    of each of its children into a list, and returns the list of those lists
    (one entry per matching element, in document order).

    ``node`` defaults to the module-global ``subnode`` (the element the main
    loop is currently visiting), so existing single-argument calls keep
    working; passing ``node`` explicitly removes the hidden-global coupling
    and makes the function testable on its own.
    """
    if node is None:
        node = subnode  # fall back to the main loop's current element
    return [[a.text for a in i] for i in node if i.tag == list1_name]
# Namespaced copies of method3_inputs: each (container-tag, item-tag) pair
# with the DrugBank namespace prefix applied to both members.
new_method3 = [(http_add + outer, http_add + inner)
               for outer, inner in method3_inputs]
# Main pass: walk every downloaded XML file and flatten its contents into
# key_dict, keyed by (un-namespaced) tag name.
for each_file in set_of_files:
    tree = ET.parse(each_file)
    root = tree.getroot()
    # One-level tags: grab the text of each key directly under every <drug>.
    for i in key_list:
        for child in root:  # getchildren() is deprecated; iterate directly
            found = child.find(val(i))
            # Guard: the original crashed with AttributeError on files where
            # a tag was absent or empty.
            if found is None or found.text is None:
                continue
            if i not in key_dict:
                key_dict[i] = [found.text.encode('utf-8')]
            else:
                key_dict[i].append(found.text.encode('utf-8'))
    for node in root:
        for subnode in node:
            # Three-level case: general-references -> articles -> article -> pubmed-id.
            if subnode.tag == '{http://www.drugbank.ca}general-references':
                pubmed_ids = method2('{http://www.drugbank.ca}articles',
                                     '{http://www.drugbank.ca}article',
                                     '{http://www.drugbank.ca}pubmed-id')
                if 'pubmed-id' not in key_dict:
                    key_dict['pubmed-id'] = pubmed_ids
                else:
                    key_dict['pubmed-id'].append(pubmed_ids)
            # Two-level cases driven by the (plain, namespaced) pair tables.
            for a, b in zip(method3_inputs, new_method3):
                if subnode.tag == b[0]:
                    if a[0] not in key_dict:
                        key_dict[a[0]] = method3(b[1])
                    else:
                        # BUG FIX: the original did key_dict[method3_inputs]
                        # (indexing with an unhashable list -> TypeError);
                        # the intended key is this entry's container tag.
                        key_dict[a[0]].append(method3(b[1]))
            if subnode.tag == '{http://www.drugbank.ca}classification':
                # BUG FIX: the original nested these checks under
                # i.tag == ap_sub(each_item) for each_item in list_to_run_thru,
                # but 'alternative-parent' and 'substituent' are not in that
                # list, so both branches were dead and the accumulator lists
                # stayed empty. Check the children directly instead.
                for i in subnode:
                    if i.tag == '{http://www.drugbank.ca}alternative-parent':
                        alternative_parents.append(i.text)
                    elif i.tag == '{http://www.drugbank.ca}substituent':
                        substituents.append(i.text)
            # Flat containers: collect every child's text under the bare name.
            for i in method1:
                if subnode.tag == '{http://www.drugbank.ca}' + i:
                    for n in subnode:
                        if i not in key_dict:
                            key_dict[i] = [n.text]
                        else:
                            key_dict[i].append(n.text)
            if subnode.tag == '{http://www.drugbank.ca}pathways':
                # NOTE(review): subnode already IS the <pathways> element, so
                # asking method2 for a 'pathways' child of it looks one level
                # too deep and likely always returns [] — confirm against the
                # schema before relying on this key.
                drugs = method2('{http://www.drugbank.ca}pathways',
                                '{http://www.drugbank.ca}pathway',
                                '{http://www.drugbank.ca}drug')
                if 'pathways' not in key_dict:
                    key_dict['pathways'] = drugs
                else:
                    key_dict['pathways'].append(drugs)
key_dict['alternative_parents'] = alternative_parents
key_dict['substituent'] = substituents
# Second pass: the targets section is scraped from the HTML page because the
# XML version of it proved harder to parse (see the question text).
html = requests.get('https://www.drugbank.ca/drugs/DB01048').text
soup = BeautifulSoup(html, 'html.parser')
div_targets = soup.find('div', class_='bond-list-container targets')
# Guard: if the page layout changes and the container is missing, find()
# returns None and the original crashed; fall back to no targets.
targets = div_targets.find_all('div', class_='bond card') if div_targets is not None else []
for target in targets:
    dt_texts = []
    dd_texts = []
    for prop in target.find_all('dt'):  # 'prop': don't shadow builtin 'property'
        dt_texts.append(prop.get_text())
    for prop in target.find_all('dd'):
        dd_texts.append(prop.get_text())
    # Each target card becomes a dict of its <dt> labels to <dd> values.
    key_dict[target.find('strong').get_text()] = dict(zip(dt_texts, dd_texts))
print(key_dict.keys())
代码在描述得很好的XML文件上运行。 但是有一些问题我想看看是否有人可以改进:
可以看到,我已经为两层和三层编写了一个函数(函数分别称为method3和method2),然后将其硬编码在属于每个函数的列表中。
我想知道是否有人可以向我展示比这更好、更干净、更有效的写法。我的想法是:给定一个标签名列表(例如 description、products),让函数自动判断标签的嵌套深度——例如“这是一个三层标签(products -> product -> name)”——从而不必分别为所有三层标签、两层标签等硬编码处理逻辑。类似这样的 if 语句:“如果标签有三层深……执行这个函数;如果标签有两层深……执行那个函数”。
免责声明:我知道此方法同时使用BeautifulSoup和ET进行解析,因为我被困在一节中并在此处获得了帮助。 不幸的是,我需要精通一个,然后才能继续,而且我发现HTML版本比XML更加混乱,因此这就是脚本从XML和HTML角度解析文件的原因。
是否有人对如何使脚本更简洁有任何一般性评论? 这个想法是将每个“顶级”标签(即紧接在“毒品类型”标签下方的所有内容)都放入标签,然后将所有这些信息基本上下拉到词典/列表/任何内容中,然后在我想搜索的任何时候某些情况下,我可以在词典中搜索“最高级”字词,并且它具有该标签的子数据,我可以阅读该标签(这些级别的组织方式正确,例如,如果标签下方只有一个级别,则只有一个string是可以的,但是如果标签下有很多标签,则可能返回字典/列表/元组/更合适的东西)。
编辑:基于以下帮助,我下载了drugbank.xsd并运行了此命令:
pyxbgen -m DB01048 -u drugbank.xsd --no-validate-changes
然后这个脚本:
from __future__ import print_function
import DB01048

# Read the raw XML and unmarshal it into PyXB binding objects generated by
# pyxbgen from drugbank.xsd. The 'with' block closes the file handle (the
# original open(...).read() leaked it).
with open('DB01048.xml') as f:
    xml = f.read()
d_01048 = DB01048.CreateFromDocument(xml)
#print(d_01048.drug[0].state)
#print(d_01048.drug[0].name)
#print(d_01048.drug[0].general_references.articles.article[0].pubmed_id)
和错误:
Traceback (most recent call last):
File "parse_drug_db.py", line 5, in <module>
d_01048 = DB01048.CreateFromDocument(xml)
File "/home/drugs/DB01048.py", line 65, in CreateFromDocument
saxer.parse(io.BytesIO(xmld))
File "/usr/lib/python2.7/xml/sax/expatreader.py", line 110, in parse
xmlreader.IncrementalParser.parse(self, source)
File "/usr/lib/python2.7/xml/sax/xmlreader.py", line 123, in parse
self.feed(buffer)
File "/usr/lib/python2.7/xml/sax/expatreader.py", line 213, in feed
self._parser.Parse(data, isFinal)
File "/usr/lib/python2.7/xml/sax/expatreader.py", line 365, in end_element_ns
self._cont_handler.endElementNS(pair, None)
File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/saxer.py", line 388, in endElementNS
binding_object = this_state.endBindingElement()
File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/saxer.py", line 245, in endBindingElement
return self.__bindingInstance._postDOMValidate()
File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/basis.py", line 2652, in _postDOMValidate
self._validateAttributes()
File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/basis.py", line 2246, in _validateAttributes
au.validate(self)
File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/content.py", line 251, in validate
raise pyxb.MissingAttributeError(type(ctd_instance), self.__name, ctd_instance)
pyxb.exceptions_.MissingAttributeError: Instance of <class 'DB01048.drugbank_type'> lacks required attribute exported-on
一种更pythonic的方法可能是使用PyXB包来生成从xml文件解组的python对象。
pip install PyXB
pyxbgen -m DB01048 -u drugbank-plus.xsd --no-validate-changes
使用生成的模块作为
from __future__ import print_function
import DB01048
xml = open('DB01048.xml').read()
d_01048 = DB01048.CreateFromDocument(xml)
print(d_01048.drug[0].state)
print(d_01048.drug[0].name)
print(d_01048.drug[0].general_references.articles.article[0].pubmed_id)
solid Abacavir 17356469
如果出现与某些exported-on
属性相关的MissingAttribute错误,请在DB01048.py的第一个Class之前添加以下行
pyxb.RequireValidWhenParsing(True) pyxb.RequireValidWhenParsing(False)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.