简体   繁体   中英

Parse a txt file and store data into a dictionary

I have a set of data that I would like to extract from a txt file and store in a specific format. The data is currently in a txt file like so:

set firewall family inet filter INBOUND term TEST from source-address 1.1.1.1/32
set firewall family inet filter INBOUND term TEST from destination-prefix-list test-list
set firewall family inet filter INBOUND term TEST from protocol udp
set firewall family inet filter INBOUND term TEST from destination-port 53
set firewall family inet filter INBOUND term TEST then accept
set firewall family inet filter PROD term LAN from source-address 4.4.4.4/32
set firewall family inet filter PROD term LAN from source-address 5.5.5.5/32
set firewall family inet filter PROD term LAN from protocol tcp
set firewall family inet filter PROD term LAN from destination-port 443
set firewall family inet filter PROD term LAN then deny

I would like the data to be structured so that each rule has its respective options placed into a dictionary, and each dictionary is appended to a list. For example:

Expected Output

[{'Filter': 'INBOUND', 'Term': 'TEST', 'SourceIP': '1.1.1.1/32', 'DestinationList': 'test-list', 'Protocol': 'udp', 'DestinationPort': '53', 'Action': 'accept'},
{'Filter': 'PROD', 'Term': 'LAN', 'SourceIP': ['4.4.4.4/32','5.5.5.5/32'], 'Protocol': 'tcp', 'DestinationPort': '443', 'Action': 'deny'}]

As you can see there may be instances where a certain trait does not exist for a rule. I would also have to add multiple IP addresses as a value. I am currently using Regex to match the items in the txt file. My thought was to iterate through each line in the file, find any matches and add them as a key-value pair to a dictionary.

Once I get an "accept" or "deny", that should signal the end of the rule: I will append the dictionary to the list, clear the dictionary and start the process again with the next rule. However, this does not seem to be working as intended. My Regex seems fine, but I can't seem to figure out the logic for processing each line, adding multiple values to a value list, and adding values to the dictionary. Here is my code below:

import re

# Path of the text file holding the firewall "set ..." lines.
data_file = "sample_data.txt"

##### REGEX PATTERNS #####

# Each pattern uses a look-behind on a keyword and captures the value after it.
filter_re = r'(?<=filter\s)(.*)(?=\sterm.)'
term_re = r'(?<=term\s)(.*)(?=\sfrom|\sthen)'
protocol_re = r'(?<=protocol\s)(.*)'
dest_port_re = r'(?<=destination-port\s)(.*)'
source_port_re = r'(?<=from\ssource-port\s)(.*)'
prefix_source_re = r'(?<=from\ssource-prefix-list\s)(.*)'
prefix_dest_re = r'(?<=from\sdestination-prefix-list\s)(.*)'
source_addr_re = r'(?<=source-address\s)(.*)'
dest_addr_re = r'(?<=destination-address\s)(.*)'
action_re = r'(?<=then\s)(deny|accept)'

pattern_list = [filter_re, term_re, source_addr_re, prefix_source_re, source_port_re, dest_addr_re, prefix_dest_re, dest_port_re, protocol_re, action_re]

# Headers are paired positionally with pattern_list (via zip), so both lists
# must have the same length and order.  The comma after "Destination_Address"
# has to sit outside the string: inside it, the two adjacent literals are
# concatenated into one header and every later header shifts by one.
pattern_headers = ["Filter", "Term", "Source_Address", "Source_Prefix_List", "Source_Port", "Destination_Address", "Destination_Prefix_List", "Destination_Port", "Protocol", "Action"]

final_list = []

def open_file(file):
    """Read *file* line by line and collect regex matches into ``rule_dict``.

    Each line is tested against every pattern in ``pattern_list``; the first
    match is stored under the header paired with that pattern in
    ``pattern_headers``.  Nothing is returned: ``rule_dict`` and
    ``final_list`` are printed once the file has been read.
    """
    rule_dict = {}
    with open(file, 'r') as f:
        line = f.readline()
        while line:
            # NOTE(review): the line read before the loop is replaced here
            # before it is processed, so the first line is never examined.
            line = f.readline().strip()
            for header, pattern in zip(pattern_headers,pattern_list):
                match = re.findall(pattern, line)
                if len(match) != 0:
                    # NOTE(review): a value cannot equal both strings, so this
                    # condition is always true and the else branch never runs.
                    if header != 'accept' or header != 'deny':
                        rule_dict[header] = match[0]
                    else:
                        rule_dict[header] = match[0]
                        # NOTE(review): ``final`` is not defined in the visible
                        # code; the module-level list is named ``final_list``.
                        final.append(rule_dict)
                        rule_dict = {}
    print(rule_dict)
    print(final_list)

The final list is empty, and rule_dict only contains the final rule from the text file rather than both of the rulesets. Any guidance would be greatly appreciated.

There are a few small mistakes in your code:

  • in your while loop f.readline() needs to be at the end, otherwise you already begin in line 2 (readline called twice before doing anything)
  • final_list has to be defined in your function and then used under that name (instead of only "final")
  • if header != 'accept' or header != 'deny': here there needs to be an and. At least one of the two comparisons is always True, so the else part never gets executed.
  • you need to check the match for accept|deny , not the header
  • for example, for Source_Address you want a list of all the IPs you find. The way you do it, the value is overwritten on every match, so only the last IP found would end up in your final_list
def open_file(file, headers=None, patterns=None):
    """Parse firewall ``set`` lines from *file* into one dictionary per rule.

    Every line is tested against each pattern; a match is stored under the
    header paired with that pattern.  A rule ends on the line that matches
    the ``Action`` pattern: the collected fields are appended to the result
    and a fresh rule is started.  A header seen with one distinct value maps
    to a plain string; a header seen with several distinct values maps to a
    list in file order.

    Args:
        file: Path of the text file to read.
        headers: Field names, paired positionally with *patterns*.  Defaults
            to the module-level ``pattern_headers``.
        patterns: Regular expressions, one per header.  Defaults to the
            module-level ``pattern_list``.

    Returns:
        The list of rule dictionaries.  The list and any unterminated
        trailing rule are also printed.
    """
    if headers is None:
        headers = pattern_headers
    if patterns is None:
        patterns = pattern_list

    final_list = []
    rule_dict = {}
    with open(file) as f:
        for raw_line in f:
            line = raw_line.strip()
            for header, pattern in zip(headers, patterns):
                match = re.findall(pattern, line)
                if not match:
                    continue

                # A list (not a set) keeps multiple values in file order; the
                # membership test still drops repeats such as the filter name
                # that appears on every line of a rule.
                values = rule_dict.setdefault(header, [])
                if match[0] not in values:
                    values.append(match[0])

                # Keying on the header, not on the matched text, keeps a term
                # that is literally named "accept"/"deny" from ending the rule.
                if header == "Action":
                    # adjust values of dict to list (if multiple values) or
                    # just a value before appending to list
                    final_list.append(
                        {k: (v if len(v) > 1 else v[0]) for k, v in rule_dict.items()}
                    )
                    rule_dict = {}

    print(f"{rule_dict=}")
    print(f"{final_list=}")
    return final_list


if __name__ == "__main__":
    open_file(data_file)

Output:

rule_dict={}
final_list=[
    {
        'Filter': 'INBOUND', 
         'Term': 'TEST', 
         'Source_Address': '1.1.1.1/32', 
         'Destination_Prefix_List': 'test-list', 
         'Protocol': 'udp', 'Destination_Port': '53', 
         'Action': 'accept'
    }, 
    {
        'Filter': 'PROD', 
         'Term': 'LAN', 
         'Source_Address': ['5.5.5.5/32', '4.4.4.4/32'], 
         'Protocol': 'tcp', 
         'Destination_Port': '443', 
         'Action': 'deny'
    }
]

There are a few things that I have changed in your code:

  • When "accept" or "deny" is found as the action, append final_dict to final_list and reset final_dict to an empty dictionary
  • Allow more than one SourceIP: when a rule has more than one SourceIP, the value of SourceIP becomes a list

import re
# Location of the text file that holds the firewall "set ..." lines.
data_file = "/home/hiraltalsaniya/Documents/Hiral/test"

# Each pattern uses a look-behind on a keyword and captures the value after it.
filter_re = r'(?<=filter\s)(.*)(?=\sterm.)'
term_re = r'(?<=term\s)(.*)(?=\sfrom|\sthen)'
source_addr_re = r'(?<=source-address\s)(.*)'
prefix_source_re = r'(?<=from\ssource-prefix-list\s)(.*)'
source_port_re = r'(?<=from\ssource-port\s)(.*)'
dest_addr_re = r'(?<=destination-address\s)(.*)'
prefix_dest_re = r'(?<=from\sdestination-prefix-list\s)(.*)'
dest_port_re = r'(?<=destination-port\s)(.*)'
protocol_re = r'(?<=protocol\s)(.*)'
action_re = r'(?<=then\s)(deny|accept)'

# Output key paired with the pattern that fills it, in the order the patterns
# are tried on every line.  Keeping the pairs side by side makes it impossible
# for the two derived lists to drift out of step.
_FIELD_PATTERNS = (
    ("Filter", filter_re),
    ("Term", term_re),
    ("SourceIP", source_addr_re),
    ("Source_Prefix_List", prefix_source_re),
    ("Source_Port", source_port_re),
    ("Destination_Address", dest_addr_re),
    ("DestinationList", prefix_dest_re),
    ("Destination_Port", dest_port_re),
    ("Protocol", protocol_re),
    ("Action", action_re),
)

pattern_headers = [field for field, _ in _FIELD_PATTERNS]
pattern_list = [regex for _, regex in _FIELD_PATTERNS]

def open_file(file, headers=None, patterns=None):
    """Parse firewall ``set`` lines from *file* into one dictionary per rule.

    Every line is tested against each pattern; a match is stored under the
    header paired with that pattern.  The line that matches the ``Action``
    pattern closes the rule: the action is recorded, the rule is appended to
    the result and a fresh rule is started.  ``SourceIP`` becomes a flat list
    as soon as a rule carries more than one source address; every other
    header keeps the last value seen.

    Args:
        file: Path of the text file to read.
        headers: Field names, paired positionally with *patterns*.  Defaults
            to the module-level ``pattern_headers``.
        patterns: Regular expressions, one per header.  Defaults to the
            module-level ``pattern_list``.

    Returns:
        The list of rule dictionaries (also printed).
    """
    if headers is None:
        headers = pattern_headers
    if patterns is None:
        patterns = pattern_list

    final_dict: dict = {}
    final_list: list = []
    with open(file) as f:
        for raw_line in f:
            # Strip so a trailing "\r" from CRLF files never ends up in a value.
            line = raw_line.strip()
            for header, pattern in zip(headers, patterns):
                match = re.search(pattern, line)
                if not match:
                    continue
                value = match.group()

                if header == "Action":
                    # End of the rule.  Keying on the header (not the matched
                    # text) keeps a term named "accept"/"deny" from closing
                    # the rule, and the action itself is stored in the rule.
                    final_dict[header] = value
                    final_list.append(final_dict)
                    final_dict = {}
                elif header == "SourceIP" and header in final_dict:
                    # Second and later addresses: extend the existing list
                    # instead of wrapping it again, so the list stays flat.
                    current = final_dict[header]
                    if isinstance(current, list):
                        current.append(value)
                    else:
                        final_dict[header] = [current, value]
                else:
                    final_dict[header] = value

    print("final_list=", final_list)
    return final_list


if __name__ == "__main__":
    open_file(data_file)

Output:

final_list= [{'Filter': 'INBOUND', 
              'Term': 'TEST', 
              'SourceIP': '1.1.1.1/32', 
              'DestinationList': 'test-list', 
              'Protocol': 'udp', 
              'Destination_Port': '53'
            }, 
            {'Filter': 'PROD', 
             'Term': 'LAN', 
             'SourceIP': ['4.4.4.4/32', '5.5.5.5/32'], 
             'Protocol': 'tcp', 
             'Destination_Port': '443'
            }]

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM