简体   繁体   中英

Parsing file with ElementTree and BeautifulSoup: is there a way to parse the file by number of tag levels?

I have this xml file, and I basically want to record all of the information into a dictionary.

I wrote this code:

import requests
import xml.etree.ElementTree as ET
import urllib2
import glob
import pprint
from bs4 import BeautifulSoup

#get the XML file
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'w') as input:
#          input.write(response.content)


# Module-level configuration: input files, tag names to extract, and the
# accumulators that the parsing loop fills in.

# Every previously downloaded XML dump in the working directory.
set_of_files = glob.glob('output*txt')


def val(x):
    """Return *x* (converted with ``str``) qualified with the DrugBank namespace."""
    return "{http://www.drugbank.ca}" + str(x)


# Tags whose text is read directly from each top-level element.
key_list = [
    'drugbank-id',
    'name',
    'description',
    'cas-number',
    'unii',
    'average-mass',
    'monoisotopic-mass',
    'state',
    'indication',
    'pharmacodynamics',
    'mechanism-of-action',
    'toxicity',
    'metabolism',
    'absorption',
    'half-life',
    'protein-binding',
    'route-of-elimination',
    'volume-of-distribution',
    'fda-label',
    'msds',
]

# Result dictionary shared by the whole script.
key_dict = {}

# Container tags whose direct children each hold one text value.
method1 = ['groups', 'synonyms', 'patent']

# (container tag, item tag) pairs handled by method3().
method3_inputs = [
    ('dosages', 'dosage'),
    ('salts', 'salt'),
    ('products', 'product'),
    ('mixtures', 'mixture'),
    ('packagers', 'packager'),
    ('categories', 'category'),
    ('atc-codes', 'atc-code'),
    ('pdb-entries', 'pdb-entry'),
    ('food-interactions', 'food-interaction'),
    ('drug-interactions', 'drug-interaction'),
    ('properties', 'property'),
    ('external-identifiers', 'external-identifier'),
    ('external-links', 'external-link'),
    ('reactions', 'reaction'),
]

# Child tags of <classification> that the parsing loop iterates over.
list_to_run_thru = [
    'description',
    'direct-parent',
    'kingdom',
    'superclass',
    'class',
    'subclass',
]

# Accumulators stored into key_dict at the end of the script.
alternative_parents = []
substituents = []

# Namespace prefix carried by every tag in the parsed tree.
http_add = '{http://www.drugbank.ca}'


def ap_sub(x):
    """Return the string *x* qualified with the DrugBank namespace."""
    return '{http://www.drugbank.ca}' + x

def method2(list2_name,list3_name,list4_name):
    """Collect text found three levels below the current ``subnode``.

    Reads the module-level ``subnode`` element (set by the parsing loop) and
    descends through children tagged ``list2_name``, then their children
    tagged ``list3_name``, then their children tagged ``list4_name``.

    Returns a list with the ``.text`` of every element reached that way, in
    document order (``None`` for elements without text).
    """
    collected = []
    for outer in subnode:
        if outer.tag != list2_name:
            continue
        for middle in outer:
            if middle.tag != list3_name:
                continue
            collected.extend(leaf.text for leaf in middle if leaf.tag == list4_name)
    return collected

def method3(list1_name):
    """Return one row of child texts per ``list1_name`` element under ``subnode``.

    Reads the module-level ``subnode`` element (set by the parsing loop). For
    every direct child tagged ``list1_name`` a list holding the ``.text`` of
    each of that child's own children is produced.

    Returns a list of those rows in document order. A matching element with
    no children contributes an empty row.
    """
    rows = []
    for entry in subnode:
        if entry.tag == list1_name:
            # Exactly one row per matching element. The row used to be
            # appended inside the field loop, which repeated the same list
            # object once for every field it contained.
            rows.append([field.text for field in entry])
    return rows

# Namespace-qualified (container tag, item tag) pairs, index-aligned with
# method3_inputs so the two lists can be zipped together.
new_method3 = [(http_add + pair[0], http_add + pair[1]) for pair in method3_inputs]

# Walk every downloaded XML file and accumulate its contents in key_dict.
for each_file in set_of_files:
    root = ET.parse(each_file).getroot()

    # Single-level fields: one value per top-level element for each name in
    # key_list.
    for field_name in key_list:
        # Iterate the element directly; Element.getchildren() no longer
        # exists on Python 3.9+.
        for child in root:
            text = child.findtext(val(field_name))
            # findtext() gives None only when the tag is absent and '' when
            # it is present but empty, so neither case raises any more. The
            # None placeholder keeps values aligned with the elements read.
            key_dict.setdefault(field_name, []).append(
                None if text is None else text.encode('utf-8'))

    for node in root:
        # The loop variable must stay named ``subnode``: method2() and
        # method3() read it as a module-level global.
        for subnode in node:
            if subnode.tag == '{http://www.drugbank.ca}general-references':
                # extend() keeps the list flat across files; append() used to
                # nest every later file's result inside the first one.
                key_dict.setdefault('pubmed-id', []).extend(
                    method2('{http://www.drugbank.ca}articles',
                            '{http://www.drugbank.ca}article',
                            '{http://www.drugbank.ca}pubmed-id'))

            for a, b in zip(method3_inputs, new_method3):
                if subnode.tag == b[0]:
                    # The key is the plain container name a[0]. Indexing with
                    # the whole method3_inputs list raised TypeError
                    # (unhashable type) on the second occurrence.
                    key_dict.setdefault(a[0], []).extend(method3(b[1]))

            if subnode.tag == '{http://www.drugbank.ca}classification':
                # These two tags are not in list_to_run_thru, so filtering on
                # that list first meant neither accumulator was ever filled.
                for item in subnode:
                    if item.tag == '{http://www.drugbank.ca}alternative-parent':
                        alternative_parents.append(item.text)
                    elif item.tag == '{http://www.drugbank.ca}substituent':
                        substituents.append(item.text)

            for name in method1:
                if subnode.tag == '{http://www.drugbank.ca}' + name:
                    for n in subnode:
                        key_dict.setdefault(name, []).append(n.text)

            if subnode.tag == '{http://www.drugbank.ca}pathways':
                # NOTE(review): method2() searches the children of the
                # current <pathways> subnode for another <pathways> tag, so
                # this looks like it always yields an empty list -- confirm
                # the intended nesting before relying on this key.
                key_dict.setdefault('pathways', []).extend(
                    method2('{http://www.drugbank.ca}pathways',
                            '{http://www.drugbank.ca}pathway',
                            '{http://www.drugbank.ca}drug'))


key_dict['alternative_parents'] = alternative_parents
key_dict['substituent'] = substituents


# Scrape the "targets" cards from the drug's HTML page and store each card's
# <dt>/<dd> pairs in key_dict under the card's <strong> heading.
html = requests.get('https://www.drugbank.ca/drugs/DB01048').text
soup = BeautifulSoup(html, 'html.parser')
div_targets = soup.find('div', class_='bond-list-container targets')
# find() returns None when the container is not on the page; treat that as
# "no targets" instead of failing with AttributeError.
targets = div_targets.find_all('div', class_='bond card') if div_targets is not None else []

for target in targets:
    # Labels and values are paired by position. The loop variables no longer
    # shadow the builtin ``property``.
    k = [term.get_text() for term in target.find_all('dt')]
    v = [detail.get_text() for detail in target.find_all('dd')]
    key_dict[target.find('strong').get_text()] = dict(zip(k, v))

# Parenthesised form prints the same output on Python 2 and is also valid
# syntax on Python 3.
print(key_dict.keys())

The code runs on the XML file described fine. But there are some issues I would like to see if anyone can improve:

  1. You can clearly see I had to hard-code a lot of it. For example, some of the tags are one-layer deep (eg drugbank-id), some are two layers deep (eg groups -> group), some are three layers deep (eg products, product,name) etc.

You can see that I've written a function for the two and three layers deep (the functions are called method3 and method2), and then I've hardcoded in lists that belong to each function.

I'm wondering whether anyone can show me code that is better/cleaner/more efficient than this. My idea is that I have a list of IDs (e.g. description, products) and the function somehow understands 'this is a 3-layer tag' (e.g. products -> product -> name), so that I don't have to hard-code all the 3-layer tags, all the 2-layer tags, etc. Something like an if statement: 'if tags are 3 layers deep, do this function; if tags are 2 layers deep, do this'.

  1. Disclaimer: I know this method uses both BeautifulSoup and ET to parse, as I was stuck on a section and got help here. Unfortunately I need to master one before I can move on, and I found the HTML version much more confusing than the XML, for now, so that's why the script parses the file from both an XML and an HTML perspective.

  2. Does anyone have any general comments on how this script can be made cleaner? The idea would be to take each of the 'top-level' tags (ie everything immediately below the 'drug type' tag) and then basically pull down all that info into dictionaries/lists/whatever, and then whenever I want to search for something, I can just search the dictionary for the 'top level' word and it has sub-data for that tag that I can read through (and the levels are organised properly, eg if there is only one level below the tag, just a string is fine, but if there is a lot of tags below the tag, maybe returning a dictionary/list/tuple/something more appropriate).

Edit: Based on the below help, I downloaded drugbank.xsd and ran this command:

pyxbgen -m DB01048 -u drugbank.xsd --no-validate-changes

and then this script:

from __future__ import print_function
import DB01048

# Read the whole XML document; the with-block closes the file handle as soon
# as the read finishes instead of leaving it to the garbage collector.
with open('DB01048.xml') as xml_file:
    xml = xml_file.read()
# Build the binding object through the pyxbgen-generated DB01048 module.
d_01048 = DB01048.CreateFromDocument(xml)

#print(d_01048.drug[0].state)
#print(d_01048.drug[0].name)
#print(d_01048.drug[0].general_references.articles.article[0].pubmed_id)

and the error:

Traceback (most recent call last):
  File "parse_drug_db.py", line 5, in <module>
    d_01048 = DB01048.CreateFromDocument(xml)
  File "/home/drugs/DB01048.py", line 65, in CreateFromDocument
    saxer.parse(io.BytesIO(xmld))
  File "/usr/lib/python2.7/xml/sax/expatreader.py", line 110, in parse
    xmlreader.IncrementalParser.parse(self, source)
  File "/usr/lib/python2.7/xml/sax/xmlreader.py", line 123, in parse
    self.feed(buffer)
  File "/usr/lib/python2.7/xml/sax/expatreader.py", line 213, in feed
    self._parser.Parse(data, isFinal)
  File "/usr/lib/python2.7/xml/sax/expatreader.py", line 365, in end_element_ns
    self._cont_handler.endElementNS(pair, None)
  File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/saxer.py", line 388, in endElementNS
    binding_object = this_state.endBindingElement()
  File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/saxer.py", line 245, in endBindingElement
    return self.__bindingInstance._postDOMValidate()
  File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/basis.py", line 2652, in _postDOMValidate
    self._validateAttributes()
  File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/basis.py", line 2246, in _validateAttributes
    au.validate(self)
  File "/home/aoidoh/bin/local/lib/python2.7/site-packages/pyxb/binding/content.py", line 251, in validate
    raise pyxb.MissingAttributeError(type(ctd_instance), self.__name, ctd_instance)
pyxb.exceptions_.MissingAttributeError: Instance of <class 'DB01048.drugbank_type'> lacks required attribute exported-on

A more pythonic method could be to use the PyXB package to generate a python object unmarshalled from the xml file.

  • Install PyXB package

pip install PyXB

  • Create python module from downloaded xsd. Will create DB01048.py

pyxbgen -m DB01048 -u drugbank-plus.xsd --no-validate-changes

  • Use the generated module as

     # Usage of the pyxbgen-generated DB01048 module (line breaks restored).
     from __future__ import print_function
     import DB01048

     # The with-block closes the file handle once the document is read.
     with open('DB01048.xml') as xml_file:
         xml = xml_file.read()
     d_01048 = DB01048.CreateFromDocument(xml)
     print(d_01048.drug[0].state)
     print(d_01048.drug[0].name)
     print(d_01048.drug[0].general_references.articles.article[0].pubmed_id)

solid Abacavir 17356469

If a MissingAttributeError about the exported-on attribute appears, add the following lines before the first class definition in DB01048.py

# One statement per line (the two calls were fused onto a single line).
# NOTE(review): the second call presumably overrides the first, leaving
# validation while parsing switched off -- confirm both lines are needed.
pyxb.RequireValidWhenParsing(True)
pyxb.RequireValidWhenParsing(False)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM