[英]Parse a txt file and store data into a dictionary
我有一組數據,我想從 txt 文件中提取並以特定格式存儲。 數據當前位於 txt 文件中,如下所示:
set firewall family inet filter INBOUND term TEST from source-address 1.1.1.1/32
set firewall family inet filter INBOUND term TEST from destination-prefix-list test-list
set firewall family inet filter INBOUND term TEST from protocol udp
set firewall family inet filter INBOUND term TEST from destination-port 53
set firewall family inet filter INBOUND term TEST then accept
set firewall family inet filter PROD term LAN from source-address 4.4.4.4/32
set firewall family inet filter PROD term LAN from source-address 5.5.5.5/32
set firewall family inet filter PROD term LAN from protocol tcp
set firewall family inet filter PROD term LAN from destination-port 443
set firewall family inet filter PROD term LAN then deny
我希望將數據結構化為每個規則將其各自的選項放入字典並附加到列表的位置。 例如:
預期產出
[{'Filter': 'INBOUND', 'Term': 'TEST', 'SourceIP': '1.1.1.1/32', 'DestinationList': 'test-list', 'Protocol': 'udp', 'DestinationPort': '53', 'Action': 'accept'},
{'Filter': 'PROD', 'Term': 'LAN', 'SourceIP': ['4.4.4.4/32','5.5.5.5/32'], 'Protocol': 'tcp', 'DestinationPort': '443', 'Action': 'deny'}]
如您所見,可能存在某些規則不存在某個特征的情況。 我還必須添加多個 IP 地址作為值。 我目前正在使用正則表達式來匹配 txt 文件中的項目。 我的想法是遍歷文件中的每一行,找到任何匹配項並將它們作為鍵值對添加到字典中。
一旦我得到“接受”或“拒絕”,這應該表示規則結束,我會將字典附加到列表中,清除字典並使用下一條規則開始流程。 然而,這似乎沒有按預期工作。 我的正則表達式看起來不錯,但在處理每一行、將多個值添加到值列表以及將值添加到字典時,我似乎無法弄清楚邏輯。 這是我下面的代碼
import re
# Path of the text file holding the firewall "set ..." lines to parse.
data_file = "sample_data.txt"
##### REGEX PATTERNS #####
# Every pattern captures exactly one value out of a single "set firewall ..." line.
filter_re = r'(?<=filter\s)(.*)(?=\sterm.)'
term_re = r'(?<=term\s)(.*)(?=\sfrom|\sthen)'
protocol_re = r'(?<=protocol\s)(.*)'
dest_port_re = r'(?<=destination-port\s)(.*)'
source_port_re = r'(?<=from\ssource-port\s)(.*)'
prefix_source_re = r'(?<=from\ssource-prefix-list\s)(.*)'
prefix_dest_re = r'(?<=from\sdestination-prefix-list\s)(.*)'
source_addr_re = r'(?<=source-address\s)(.*)'
dest_addr_re = r'(?<=destination-address\s)(.*)'
action_re = r'(?<=then\s)(deny|accept)'
# pattern_list and pattern_headers are consumed pairwise with zip(), so they
# must stay the same length and in the same order.
pattern_list = [
    filter_re,
    term_re,
    source_addr_re,
    prefix_source_re,
    source_port_re,
    dest_addr_re,
    prefix_dest_re,
    dest_port_re,
    protocol_re,
    action_re,
]
# The comma after "Destination_Address" used to sit inside the string literal,
# which made Python concatenate it with the next literal: the list had 9
# entries for 10 patterns, every later header was paired with the wrong
# pattern, and action_re was dropped by zip().
pattern_headers = [
    "Filter",
    "Term",
    "Source_Address",
    "Source_Prefix_List",
    "Source_Port",
    "Destination_Address",
    "Destination_Prefix_List",
    "Destination_Port",
    "Protocol",
    "Action",
]
# Module-level accumulator for the parsed rules.
final_list = []
def open_file(file):
    """Scan *file* and collect the regex captures of each line into a dict.

    This is the question's original attempt and is kept as posted because the
    answers below quote and diagnose it.
    NOTE(review): the indentation of this listing was lost; the nesting shown
    here is reconstructed from the described behaviour -- confirm against the
    original post.
    """
    rule_dict = {}
    with open(file, 'r') as f:
        line = f.readline()
        while line:
            # NOTE(review): the line read before the loop is replaced here
            # before it is ever matched, so the first line of the file is
            # skipped. Because the line is stripped, a blank line also ends
            # the loop.
            line = f.readline().strip()
            for header, pattern in zip(pattern_headers,pattern_list):
                match = re.findall(pattern, line)
                if len(match) != 0:
                    # NOTE(review): this `or` test is true for every possible
                    # value of `header`, so the else branch never runs.
                    if header != 'accept' or header != 'deny':
                        rule_dict[header] = match[0]
                    else:
                        rule_dict[header] = match[0]
                        # NOTE(review): `final` is not defined in the visible
                        # code; the module-level list is named `final_list`.
                        final.append(rule_dict)
                        rule_dict = {}
    print(rule_dict)
    print(final_list)
最終列表為空,並且 rule_dict 僅包含來自文本文件的最終規則,而不是兩個規則集。 任何指導將不勝感激。
您的代碼中有幾個小錯誤:
- `f.readline()` 需要放在循環的最後,否則您一開始就已經從第 2 行開始了(在做任何事情之前調用了兩次 readline)。
- `final_list` 必須在您的函數中定義並正確使用(而不僅僅是 `final`)。
- `if header != 'accept' or header != 'deny':` 這裡需要的是 `and`。兩個條件中總有一個為 True,所以 `else` 部分永遠不會被執行。
- 您需要檢查的是 `accept|deny` 的匹配結果,而不是 `header`。
- 例如在 `Source_IP` 中,您希望有一個包含您找到的所有 IP 的列表。按照您的做法,值總是會被覆蓋,只有最後找到的 IP 會出現在您的 `final_list` 中。
def open_file(file, headers=None, patterns=None):
    """Parse firewall ``set`` lines from *file* into one dict per rule.

    Every line is tested against each pattern; the first capture of a
    matching pattern is stored under the header paired with it.  A line that
    yields an ``"Action"`` value closes the current rule.

    Args:
        file: Path of the text file to read.
        headers: Dictionary keys, one per pattern.  Defaults to the
            module-level ``pattern_headers``.
        patterns: Regular expressions, one per header.  Defaults to the
            module-level ``pattern_list``.

    Returns:
        A list with one dict per completed rule.  A header seen with several
        distinct values maps to a list in file order; otherwise it maps to
        the single value.

    Raises:
        ValueError: If *headers* and *patterns* differ in length.
    """
    headers = list(pattern_headers if headers is None else headers)
    patterns = list(pattern_list if patterns is None else patterns)
    if len(headers) != len(patterns):
        # zip() would silently pair headers with the wrong patterns.
        raise ValueError(
            f"{len(headers)} headers given for {len(patterns)} patterns"
        )

    final_list = []
    rule_dict = {}
    with open(file) as f:
        for raw_line in f:
            line = raw_line.strip()
            rule_complete = False
            for header, pattern in zip(headers, patterns):
                found = re.findall(pattern, line)
                if not found:
                    continue
                # A list (not a set) keeps the values in file order; Filter
                # and Term repeat on every line, hence the duplicate check.
                values = rule_dict.setdefault(header, [])
                if found[0] not in values:
                    values.append(found[0])
                # Decide on the header, not the captured text, so a filter
                # or term that happens to be named "accept" or "deny" does
                # not end the rule early.
                if header == "Action":
                    rule_complete = True
            if rule_complete:
                # Collapse single-element lists to a bare value.
                final_list.append(
                    {
                        key: (vals if len(vals) > 1 else vals[0])
                        for key, vals in rule_dict.items()
                    }
                )
                rule_dict = {}
    print(f"{rule_dict=}")
    print(f"{final_list=}")
    return final_list
open_file(data_file)
輸出:
rule_dict={}
final_list=[
{
'Filter': 'INBOUND',
'Term': 'TEST',
'Source_Address': '1.1.1.1/32',
'Destination_Prefix_List': 'test-list',
'Protocol': 'udp', 'Destination_Port': '53',
'Action': 'accept'
},
{
'Filter': 'PROD',
'Term': 'LAN',
'Source_Address': ['5.5.5.5/32', '4.4.4.4/32'],
'Protocol': 'tcp',
'Destination_Port': '443',
'Action': 'deny'
}
]
我在您的代碼中有幾處更改:
import re
# Location of the text file holding the firewall "set ..." lines.
data_file = "/home/hiraltalsaniya/Documents/Hiral/test"

# Rule identity: filter name and term name.
filter_re = r'(?<=filter\s)(.*)(?=\sterm.)'
term_re = r'(?<=term\s)(.*)(?=\sfrom|\sthen)'

# Source-side match conditions.
source_addr_re = r'(?<=source-address\s)(.*)'
prefix_source_re = r'(?<=from\ssource-prefix-list\s)(.*)'
source_port_re = r'(?<=from\ssource-port\s)(.*)'

# Destination-side match conditions.
dest_addr_re = r'(?<=destination-address\s)(.*)'
prefix_dest_re = r'(?<=from\sdestination-prefix-list\s)(.*)'
dest_port_re = r'(?<=destination-port\s)(.*)'

# Protocol and the terminating action.
protocol_re = r'(?<=protocol\s)(.*)'
action_re = r'(?<=then\s)(deny|accept)'

# Each header is written next to its pattern, then the pairs are split back
# into the two parallel lists that open_file() zips together.
pattern_headers, pattern_list = (
    list(column)
    for column in zip(
        ("Filter", filter_re),
        ("Term", term_re),
        ("SourceIP", source_addr_re),
        ("Source_Prefix_List", prefix_source_re),
        ("Source_Port", source_port_re),
        ("Destination_Address", dest_addr_re),
        ("DestinationList", prefix_dest_re),
        ("Destination_Port", dest_port_re),
        ("Protocol", protocol_re),
        ("Action", action_re),
    )
)
def open_file(file, headers=None, patterns=None):
    """Parse firewall ``set`` lines from *file* into one dict per rule.

    Every line is searched with each pattern and the matched text is stored
    under the header paired with that pattern.  A line that yields an
    ``"Action"`` value closes the current rule.

    Args:
        file: Path of the text file to read.
        headers: Dictionary keys, one per pattern.  Defaults to the
            module-level ``pattern_headers``.
        patterns: Regular expressions, one per header.  Defaults to the
            module-level ``pattern_list``.

    Returns:
        A list with one dict per completed rule.  ``"SourceIP"`` maps to a
        flat list when a rule has several source addresses; every other
        header keeps its most recent value.

    Raises:
        ValueError: If *headers* and *patterns* differ in length.
    """
    headers = list(pattern_headers if headers is None else headers)
    patterns = list(pattern_list if patterns is None else patterns)
    if len(headers) != len(patterns):
        # zip() would silently pair headers with the wrong patterns.
        raise ValueError(
            f"{len(headers)} headers given for {len(patterns)} patterns"
        )

    final_list = []
    final_dict = {}
    with open(file) as f:
        for raw_line in f:
            # Strip so trailing whitespace never ends up inside a value.
            line = raw_line.strip()
            rule_complete = False
            for header, pattern in zip(headers, patterns):
                match = re.search(pattern, line)
                if not match:
                    continue
                value = match.group()
                if header == "SourceIP" and header in final_dict:
                    existing = final_dict[header]
                    if isinstance(existing, list):
                        # Third and later addresses extend the same list
                        # instead of wrapping it in another list.
                        existing.append(value)
                    else:
                        final_dict[header] = [existing, value]
                else:
                    final_dict[header] = value
                # The action is stored like any other field (it used to be
                # dropped) and marks the end of the rule.
                if header == "Action":
                    rule_complete = True
            if rule_complete:
                final_list.append(final_dict)
                final_dict = {}
    print("final_list=", final_list)
    return final_list
open_file(data_file)
輸出(注意:此處顯示的結果不含 'Action' 鍵,與問題中的預期產出不同):
final_list= [{'Filter': 'INBOUND',
'Term': 'TEST',
'SourceIP': '1.1.1.1/32',
'DestinationList': 'test-list',
'Protocol': 'udp',
'Destination_Port': '53'
},
{'Filter': 'PROD',
'Term': 'LAN',
'SourceIP': ['4.4.4.4/32', '5.5.5.5/32'],
'Protocol': 'tcp',
'Destination_Port': '443'
}]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.