Python: fastest way to recursively check and update a file path if it is missing in the DB
Input file path
Example: "/A/B/C/D/E/F"

1. Check whether the path "/A/B/C/D/E/F" exists; if it does not, go to the next step.
2. Remove "/F" from "/A/B/C/D/E/F" and check for "/A/B/C/D/E" in system_file_paths.
   a. If it exists, create the F path.
   b. Return F_ID.
   c. If "/A/B/C/D/E" does not exist, go to the next step.
3. Remove "/E/F" from "/A/B/C/D/E/F" and check for "/A/B/C/D" in system_file_paths.
   a. If it exists, create E/F.
   b. Return F_ID.
   c. If "/A/B/C/D" does not exist, go to the next step.
4. Remove "/D/E/F" from "/A/B/C/D/E/F" and check for "/A/B/C" in system_file_paths.
   a. If it exists, create D/E/F.
   b. Return F_ID.
   c. If "/A/B/C" does not exist, go to the next step.
5. Repeat until only the last component "A" is left; if "A" does not exist either, create the whole chain starting from "A".
I have written the code and it works as expected, but performance is degrading because I have a huge number of files (on the order of 1000's of millions) and long file_path lengths.
Note: I am not trying to create any directories in the OS; I am only updating the application DB.
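For reference, the walk-up-and-create procedure described in the steps above boils down to something like the following recursive sketch (a minimal illustration only; ensure_path is a hypothetical helper, and it assumes the dict-backed system_file_paths and the update_db / check_file_path_exists helpers defined in the full code below):

def ensure_path(path):
    # Hypothetical helper, not part of the original code:
    # return the ID for 'path', creating any missing ancestors first.
    if check_file_path_exists(path):
        return system_file_paths[path]
    parent, _, _ = path.rpartition("/")
    if parent:
        ensure_path(parent)    # make sure the parent chain exists
    return update_db(path)     # then create this level and return its new ID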
My code:
# Random system-generated input paths for testing
import uuid

def file_path_list(num):
    file_path_list = []
    temp_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    for i in range(num):
        temp = "{0}".format(i)
        for j in temp_list:
            temp += "{0}".format(j)
        file_path_list.append(temp)
    temp_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', '1', '2', '3', '4']
    for i in range(num):
        temp = "{0}".format(i)
        for j in temp_list:
            temp += "{0}".format(j)
        file_path_list.append(temp)
    temp_list = ['a', 'b', 'c', 'd', 'x', 'y', 'z']
    for i in range(num):
        temp = "{0}".format(i)
        for j in temp_list:
            temp += "{0}".format(j)
        file_path_list.append(temp)
    return file_path_list

path_trees = file_path_list(10)
system_file_paths = {}
for path in path_trees:
    system_file_paths[path] = str(uuid.uuid4())
# ************ system paths generation completed for testing ***************
# Input file
file_path = 'A/B/C/D/E/F'

def update_db(path_name):
    _id = str(uuid.uuid4())
    system_file_paths[path_name] = _id
    return _id

# Check whether a file path already exists in system_file_paths
def check_file_path_exists(file_path):
    if file_path in system_file_paths:
        return True
    return False
# Looking for a solution to speed this method up by applying a suitable algorithm and reducing the number of checks
def process_to_update_db(file_path):
    # Split the file path into its components
    file_path_split = file_path.split("/")
    # Walk backwards from the deepest prefix towards the root
    for reverse_path_indent in range(len(file_path_split) - 1, -1, -1):
        # If we have walked all the way down to the root, create it (and everything below it)
        if reverse_path_indent == 0:
            if not check_file_path_exists(file_path_split[0]):
                _id = update_db(file_path_split[0])
            for path_indent in range(1, len(file_path_split)):
                file_path = "/".join(file_path_split[0: path_indent + 1])
                _id = update_db(file_path)
            return _id
            #_id = update_db(file_path_split[0])
        file_path_check = "/".join(file_path_split[0: reverse_path_indent])
        if not check_file_path_exists(file_path_check):
            continue
        else:
            # The prefix exists, so create every missing level below it
            for path_indent in range(reverse_path_indent, len(file_path_split)):
                file_path = "/".join(file_path_split[0: path_indent + 1])
                _id = update_db(file_path)
            return _id
if not check_file_path_exists(file_path):
    _id = process_to_update_db(file_path)

print(system_file_paths)

# Cross-verify that the path was created
if check_file_path_exists(file_path):
    print(0)
else:
    print(1)
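For illustration, a hypothetical multi-path run (the extra paths are made up) shows that prefixes created by one call are reused by later calls rather than recreated:

for p in ['A/B/C/D/E/F', 'A/B/C/D/E/F/G', 'A/X/Y']:
    if not check_file_path_exists(p):
        process_to_update_db(p)
    print(p, '->', system_file_paths[p])   # every path now has an ID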
I wrote a few more test cases and corrected my code:
# Random system-generated input paths for testing
import uuid, json

system_file_paths = {}
# ************ system paths generation completed for testing ***************

# Input file
file_path = [
    'A',
    'A/B/C/D/E/F',
    'A/B/C/D/G/H/I/J/K',
    'A/B/C/D/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W',
    'A/X/Y',
    'A/B/C/D/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W/Z',
    'A/B/C/D/G/H/I/J/A/B/C/D',
    'A/B/C/D/G/H/I/J/K/L/A/B/C/D/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W'
]
def update_db(path_name):
    global create_loop_count
    create_loop_count += 1
    _id = str(uuid.uuid4())
    system_file_paths[path_name] = _id
    return _id

# Check whether a file path exists; return its ID if it does, False otherwise
def check_file_path_exists(file_path):
    if file_path in system_file_paths:
        try:
            return system_file_paths[file_path]
        except KeyError:
            return False
    return False
# Original method from the question, instrumented with a search-loop counter
def process_to_update_db_original(file_path):
    global search_loop_count
    # Split the file path into its components
    file_path_split = file_path.split("/")
    # Walk backwards from the deepest prefix towards the root
    for reverse_path_indent in range(len(file_path_split) - 1, -1, -1):
        search_loop_count += 1
        # If we have walked all the way down to the root, create it (and everything below it)
        if reverse_path_indent == 0:
            if not check_file_path_exists(file_path_split[0]):
                _id = update_db(file_path_split[0])
            for path_indent in range(1, len(file_path_split)):
                file_path = "/".join(file_path_split[0: path_indent + 1])
                _id = update_db(file_path)
            return _id
            #_id = update_db(file_path_split[0])
        file_path_check = "/".join(file_path_split[0: reverse_path_indent])
        if not check_file_path_exists(file_path_check):
            continue
        else:
            for path_indent in range(reverse_path_indent, len(file_path_split)):
                file_path = "/".join(file_path_split[0: path_indent + 1])
                _id = update_db(file_path)
            return _id
# New method: check the full path first, then binary-search the depth to find where the missing directories start
def process_to_update_db_new(file_path):
    global search_loop_count
    # First of all, check to see if the path already exists:
    try:
        return system_file_paths[file_path]
    except KeyError:
        # Do things the hard way
        # Split the file path
        if file_path:
            file_path_split = [file_path]
            if "/" in file_path:
                file_path_split = file_path.split("/")
                while '' in file_path_split:
                    file_path_split.remove('')
            start_index = 0
            end_index = len(file_path_split) - 1
            current_path = ''
            build_path_start_index = 0
            # Bias the search to the last directory minus one, assuming that most
            # directory trees add a new path at the end
            current_index = end_index - 1
            # Binary search on the path prefixes to find where we need to start adding directories
            while start_index != end_index:
                search_loop_count += 1
                for i in range(build_path_start_index, current_index + 1):
                    if i != 0:
                        current_path += '/'
                    current_path += file_path_split[i]
                #print("start_index: %d, end_index: %d, current_index: %d : %s" % (start_index, end_index, current_index, current_path))
                if check_file_path_exists(current_path):
                    start_index = current_index + 1
                    if start_index > end_index:
                        start_index = end_index
                    build_path_start_index = start_index
                else:
                    current_path = ''
                    build_path_start_index = 0
                    end_index = current_index - 1
                    if end_index < start_index:
                        end_index = start_index
                current_index = int((end_index - start_index) / 2) + start_index
            # Build up the existing path
            for i in range(build_path_start_index, start_index + 1):
                if i != 0:
                    current_path += '/'
                current_path += file_path_split[i]
            # Do the actual insertion of the UUID for new paths
            for i in range(start_index, len(file_path_split)):
                if i == start_index:
                    # Check whether the current path exists
                    if check_file_path_exists(current_path):
                        # Move on to the next index
                        continue
                    else:
                        # current_path doesn't exist, create it below
                        pass
                else:
                    # Append the next path item
                    if i != 0:
                        current_path += '/'
                    current_path += file_path_split[i]
                current_id = update_db(current_path)
            return current_id
for current_path in file_path:
    if not check_file_path_exists(current_path):
        search_loop_count = 0
        create_loop_count = 0
        _id = process_to_update_db_new(current_path)
        # Cross-verify that the path was created
        if check_file_path_exists(current_path):
            print("SUCCESS")
        else:
            print("!!!FAILURE!!!")
        print("search_loop_count: %d create_loop_count %d" % (search_loop_count, create_loop_count))

print(json.dumps(system_file_paths, indent=4, sort_keys=True))
create_loop_count is the same between your original method and mine, but the number of search loops differs:
            | search_loop_count |
------------|-------------------|
Test String | Original | New    |
------------|----------|--------|
1           | 1        | 0      |
2           | 5        | 2      |
3           | 5        | 3      |
4           | 12       | 5      |
5           | 2        | 1      |
6           | 1        | 1      |
7           | 4        | 4      |
8           | 21       | 5      |
If your path depths run into the thousands, as you say in the question, the difference will be much larger.
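To put rough numbers on that: for a path d components deep where only the root already exists, the original walk performs on the order of d prefix checks before it finds an existing ancestor, while bisecting the depth needs roughly ceil(log2(d)) checks (cf. test string 8 above: 21 search loops with the original walk versus 5 with bisection). A quick sanity check of that estimate:

import math
d = 1000
print(d, math.ceil(math.log2(d)))   # up to ~1000 linear prefix checks vs ~10 with bisection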