简体   繁体   中英

XML parsing with class will not find the values

I want to split a large XML file into small branches and then parse only those parts. I am searching for the modification timestamp in the "mod_time" tag, which is available inside each "contact" tag, but my method call doesn't find the value. In some contacts, some tags are also missing completely.

I tried iterfind('tag_name'), iter(), and findall('tag_name'), but my program shows no result, and after hours I still can't figure out where my mistake is.

Here is my XML reduced to two elements:

<?xml version="1.0" encoding = "utf-8"?>
<phonebooks>
  <phonebook name="Telefonbuch">
   <contact>
      <category>0</category>
      <person>
        <realName>Dummy, Name, Street</realName>
      </person>
      <telephony nid="1">
        <number type="work" prio="1" id="0">012345678</number>
      </telephony>
      <services />
      <setup />
      <features doorphone="0" />
      <mod_time>1587477163</mod_time>
      <uniqueid>358</uniqueid>
    </contact>
    <contact>
      <category>0</category>
      <person>
        <realName>Foto Name</realName>
      </person>
      <telephony nid="1">
        <number type="home" prio="1" id="0">067856743</number>
      </telephony>
      <services />
      <setup />
      <features doorphone="0" />
      <mod_time>1547749691</mod_time>
      <uniqueid>68</uniqueid>
    </contact>
</phonebook>
</phonebooks>

And here is what I have done so far:

import timeit
import xml.etree.ElementTree as ET

class Phonebook:
    """Stream an XML file and hand every matching branch to ``Contact``."""

    def __init__(self, xml_file, tag_node):
        """Split the tree into contact branches and parse each one.

        Args:
            xml_file: Source of the XML document handed to ``ET.iterparse``.
            tag_node: Sequence whose first item is the tag name of the
                branches to extract (e.g. ``['contact']``).
        """
        self.xml_file = xml_file
        self.tag_node = tag_node
        wanted_tag = self.tag_node[0]
        count = 0
        # Only 'end' events are requested: an element is guaranteed to be
        # complete (text and children populated) only once its closing tag
        # has been parsed.
        for _event, elem in ET.iterparse(self.xml_file, events=('end',)):
            if elem.tag != wanted_tag:
                # Children of a branch must stay intact until the branch
                # itself has been parsed, so nothing is cleared here.
                # Clearing on every event emptied each contact before it
                # could be searched.
                continue
            par = Contact(elem, count)
            par.parse_node(elem, count)
            count += 1
            # The branch has been fully processed; drop its content to keep
            # memory usage low on large files.
            elem.clear()
        print("Amount of contacts:", count)


class Contact:
    """Wrapper around one branch element of the phonebook XML."""

    def __init__(self, branch, i):
        """Remember the branch element; ``i`` is accepted but not used."""
        self.tree = branch

    def parse_node(self, branch, i):
        """Print the text of every ``mod_time`` descendant of ``branch``.

        ``i`` is accepted but not used.
        """
        timestamps = branch.findall('.//mod_time')
        for stamp in timestamps:
            print(stamp.text)
         
def main():
    """Build the phonebook for the sample file, extracting 'contact' branches."""
    phonebook = Phonebook(xml_file='new _dummy1.xml', tag_node=['contact'])

    
if __name__ == '__main__':
    # Script entry point: run main() and report the elapsed time.
    started_at = timeit.default_timer()
    main()
    print('Finished')
    elapsed = timeit.default_timer() - started_at
    print("Runtime:", elapsed)

Output: Amount of contacts: 2 Finished Runtime: 0.0006361000050674193

Expected output:

1587477163 1547749691

Code

import timeit
import xml.etree.ElementTree as ET

class Phonebook:
    """Parse an XML file and print the text of every element matching a path."""

    def __init__(self, xml_file, selector):
        """Load ``xml_file``, find all ``selector`` matches and print them.

        Args:
            xml_file: Source of the XML document, passed to ``ET.parse``.
            selector: Path expression handed to ``findall``.
        """
        self.xml_file = xml_file
        self.selector = selector
        hits = ET.parse(xml_file).findall(selector)
        # The label says "contacts", but the number printed is the number
        # of elements matched by the selector.
        print("Amount of contacts:", len(hits))
        for hit in hits:
            print(hit.text)

def main():
    """Print the ``mod_time`` text of every contact in the sample file."""
    Phonebook(xml_file='./_dummy1.xml', selector='.//contact/mod_time')

if __name__ == '__main__':
    # Script entry point: run main() and report the elapsed time.
    started_at = timeit.default_timer()
    main()
    print('Finished')
    elapsed = timeit.default_timer() - started_at
    print("Runtime:", elapsed)

Output

$ python test.py
Amount of contacts: 2
1587477163
1547749691
Finished
Runtime: 0.0006627999973716214

I have now solved my issue with handing the data over between the objects. I am posting my solution because it may be interesting for others who run into similar issues. Thanks to all who tried to help!

My changed code:

import psutil
import timeit

import xml.etree.ElementTree as ET

class Phonebook:
    """Phonebook backed by an XML file containing ``contact`` elements."""

    def __init__(self, file_path):
        """Store the source of the phonebook XML document."""
        self.file_path = file_path

    def contacts_list(self, file_path=None):
        """Return every ``contact`` element of the phonebook document.

        Args:
            file_path: Optional source of the XML document to read.  When
                omitted (or ``None``) the one given to the constructor is
                used.  Previously this argument was silently ignored.

        Returns:
            list: The fully parsed ``contact`` elements in document order.
        """
        source = self.file_path if file_path is None else file_path
        # Only 'end' events are needed: an element is complete (text and
        # children populated) once its closing tag has been parsed.  The
        # elements are returned to the caller, so they are not cleared.
        return [
            elem
            for _event, elem in ET.iterparse(source, events=('end',))
            if elem.tag == 'contact'
        ]
        
class Contact(Phonebook):
    """Look up child tags of single ``contact`` elements of a phonebook."""

    # Child tags a complete contact is expected to carry.
    CONTACT_TEMPLATE = (
        'category', 'person', 'telephony', 'services',
        'setup', 'features', 'mod_time', 'uniqueid',
    )

    def __init__(self, file_path):
        """Store the source of the phonebook XML document via ``Phonebook``."""
        super().__init__(file_path)

    def search_node(self, contact, searched_tag):
        """Collect the text of every direct ``searched_tag`` child.

        Args:
            contact: Element whose direct children are inspected.
            searched_tag: Tag name to look for.

        Returns:
            list: Text of each matching child in document order.  When the
            tag is absent and is one of the expected contact tags, the list
            holds the single message ``'<searched_tag> - not assigned'``.
            An absent tag that is not an expected contact tag yields ``[]``.
        """
        values = [node.text for node in contact if node.tag == searched_tag]
        if values:
            return values
        if searched_tag in self.CONTACT_TEMPLATE:
            # Report the tag that was actually searched for.  The message
            # used to name whichever expected tag happened to be missing
            # first, and a contact without any children produced no message
            # at all.
            return [self.missed_tag([searched_tag])]
        return []

    def missed_tag(self, list_difference):
        """Return the 'not assigned' message for the first missing tag.

        Args:
            list_difference: Iterable of tag names that are missing.

        Returns:
            The message for the first item, or ``None`` when the iterable
            is empty.
        """
        for m in list_difference:
            return f'{m} - not assigned'
        return None
                    
         
def main():
    """Print the number of contacts and the ``mod_time`` result of each."""
    phonebook = Contact('dummy.xml')
    entries = phonebook.contacts_list('dummy.xml')

    mod_times = [phonebook.search_node(entry, 'mod_time') for entry in entries]
    print(len(mod_times))
    print(mod_times)
    
if __name__ == '__main__':
    # Script entry point: time main() and report system memory usage.
    started_at = timeit.default_timer()
    main()
    print('Finished')
    # Third field of virtual_memory(): memory usage in percent.
    ram_percent = psutil.virtual_memory()[2]
    print('RAM memory % used:', ram_percent)
    # Fourth field of virtual_memory(): used memory, scaled here to GB.
    ram_used_gb = psutil.virtual_memory()[3] / 1000000000
    print('RAM Used (GB):', ram_used_gb)
    print("Runtime:", timeit.default_timer() - started_at)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM