簡體   English   中英

Python 如果在 DB 中丟失,檢查和更新遞歸文件路徑的最快方法

[英]Python Fastest way to check and update Recursively file path if missing in DB

輸入文件路徑

示例:“/A/B/C/D/E/F”

  1. 檢查路徑“/A/B/C/D/E/F”是否存在;如果不存在,轉到下一步

  2. 從“/A/B/C/D/E/F”中刪除“/F”,並檢查“/A/B/C/D/E”是否存在於 system_file_paths 中

    a. 如果存在,則創建 F 路徑

    b. 返回 F_ID

    c. 如果“/A/B/C/D/E”不存在,轉到下一步

  3. 從“/A/B/C/D/E/F”中刪除“/E/F”,並檢查“/A/B/C/D”是否存在於 system_file_paths 中

    a. 如果存在,則創建 E/F

    b. 返回 F_ID

    c. 如果“/A/B/C/D”不存在,轉到下一步

  4. 從“/A/B/C/D/E/F”中刪除“/D/E/F”,並檢查“/A/B/C”是否存在於 system_file_paths 中

    a. 如果存在,則創建 D/E/F

    b. 返回 F_ID

    c. 如果“/A/B/C”不存在,轉到下一步

重複上述步驟,直到只剩根路徑 A;如果 A 也不存在,則依次創建所有缺失的路徑。

我已經編寫了代碼,它按預期工作,但是性能正在下降,因為我有大量文件(數以百萬計),而且 file_path 的長度也很長

注意:我沒有嘗試在 OS 中創建任何目錄,而是在 Application DB 中進行更新

我的代碼:

# Random system generation input paths for testing 
import uuid
def file_path_list(num):
    """Generate synthetic path-like strings for testing.

    For each suffix alphabet, produces ``num`` strings of the form
    ``str(i) + ''.join(alphabet)``, preserving the original output order
    (all strings for one alphabet before moving on to the next).

    Args:
        num: number of strings generated per alphabet.

    Returns:
        A list of ``3 * num`` strings.
    """
    # The original repeated the same loop three times with a local variable
    # that shadowed the function name, and built each string with quadratic
    # ``+=`` concatenation; one nested comprehension with ``str.join``
    # produces the identical output.
    suffix_alphabets = [
        ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
        ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', '1', '2', '3', '4'],
        ['a', 'b', 'c', 'd', 'x', 'y', 'z'],
    ]
    return [
        "{0}{1}".format(i, "".join(alphabet))
        for alphabet in suffix_alphabets
        for i in range(num)
    ]

# Seed the fake in-memory DB: every generated path maps to a fresh UUID.
path_trees = file_path_list(10)

system_file_paths = {path: str(uuid.uuid4()) for path in path_trees}

# ************             system paths generation completed  for testing       ***************

# Path that the demo below will register in the DB.
file_path = 'A/B/C/D/E/F'

def update_db(path_name):
    """Register *path_name* in the in-memory DB under a fresh UUID and return that id."""
    fresh_id = str(uuid.uuid4())
    system_file_paths[path_name] = fresh_id
    return fresh_id


# Check whether a file path is already registered.
def check_file_path_exists(file_path):
    """Return True when *file_path* is already present in system_file_paths."""
    # The membership test is itself the boolean; the original
    # if/return True/return False pattern is redundant.
    return file_path in system_file_paths

# Looking solution, to speed up process performance, by applying any suitable algorithm and reduces multiple checks, in this method
def process_to_update_db(file_path):
    """Ensure *file_path* and all of its ancestor paths exist in the DB.

    Scans prefixes of *file_path* from longest to shortest to find the
    deepest already-registered ancestor, then creates every missing path
    from that point down to the full path — the same DB writes, in the
    same order, as the original nested-loop version.

    Args:
        file_path: slash-separated path such as ``'A/B/C/D/E/F'``.

    Returns:
        The id stored for the full path. If the path already exists its
        current id is returned (the original could raise NameError for a
        single-component path whose root was already registered).
    """
    # Fast path: nothing to create.
    existing = system_file_paths.get(file_path)
    if existing is not None:
        return existing

    parts = file_path.split("/")

    # Find the longest existing prefix; 0 components means even the root
    # is missing and everything must be created.
    first_missing = len(parts) - 1
    while first_missing > 0:
        if check_file_path_exists("/".join(parts[:first_missing])):
            break
        first_missing -= 1

    # Create every missing prefix, finishing with the full path.
    _id = None
    for end in range(first_missing, len(parts)):
        _id = update_db("/".join(parts[:end + 1]))
    return _id





# Demo: register the input path only when it is not yet in the DB, dump the
# DB contents, then report 0 on success or 1 on failure.
if not check_file_path_exists(file_path):
    _id = process_to_update_db(file_path)
    print(system_file_paths)
    # Cross-check that the insertion actually registered the full path.
    result = 0 if check_file_path_exists(file_path) else 1
    print(result)
    

我做了更多的測試用例,並更正了我的代碼:

# Random system generation input paths for testing 
import uuid, json

# In-memory stand-in for the application DB: maps path string -> UUID id.
system_file_paths = {}

# ************             system paths generation completed  for testing       ***************

# Input file
# Test cases: a bare root, overlapping prefixes, and repeated segment
# sequences, chosen to exercise the prefix-search logic.
file_path = [ 
    'A',
    'A/B/C/D/E/F',
    'A/B/C/D/G/H/I/J/K',
    'A/B/C/D/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W',
    'A/X/Y',
    'A/B/C/D/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W/Z',
    'A/B/C/D/G/H/I/J/A/B/C/D',
    'A/B/C/D/G/H/I/J/K/L/A/B/C/D/G/H/I/J/K/L/M/N/O/P/Q/R/S/T/U/V/W'
]

def update_db(path_name):
    """Register *path_name* under a fresh UUID and return that id.

    Also bumps the global ``create_loop_count`` benchmark counter.
    """
    global create_loop_count
    create_loop_count += 1
    fresh_id = str(uuid.uuid4())
    system_file_paths[path_name] = fresh_id
    return fresh_id


# Look up a file path in the DB.
def check_file_path_exists(file_path):
    """Return the stored id for *file_path*, or False when it is unknown.

    The original wrapped an indexed access in try/except KeyError right
    after an ``in`` test, so the except branch could never fire;
    ``dict.get`` does the same job in a single lookup. All stored ids are
    non-empty UUID strings, so truthiness is preserved for callers that
    use the result as a boolean.
    """
    return system_file_paths.get(file_path, False)

# Baseline implementation kept for benchmark comparison against the binary-search version below.
def process_to_update_db_original(file_path):
    """Linear prefix search: probe prefixes of *file_path* from longest to
    shortest until one is found in the DB, then insert every missing path.

    Side effects: increments the global ``search_loop_count`` once per
    prefix probed (the benchmark metric) and writes via ``update_db``.
    Returns the id assigned to the full path.

    NOTE(review): for a single-component path whose root already exists,
    ``_id`` is never bound and the final ``return _id`` would raise
    NameError — callers guard with check_file_path_exists first.
    """
    global search_loop_count
    # Split file path 
    file_path_split = file_path.split("/")
    # Probe prefixes from longest (len-1 components) down to the root.
    for reverse_path_indent in range(len(file_path_split) -1 , -1, -1):
        search_loop_count += 1
        # Reached the root: even it may be missing, so create everything.
        if reverse_path_indent == 0:
            if not check_file_path_exists(file_path_split[0]):
                _id = update_db(file_path_split[0])
            # Create every longer prefix up to and including the full path.
            for path_indent in range(1, len(file_path_split)):
                file_path = "/".join(file_path_split[0: path_indent + 1])
                _id = update_db(file_path)
            return _id
            #_id = update_db(file_path_split[0])

        # Prefix made of the first `reverse_path_indent` components.
        file_path_check = "/".join(file_path_split[0: reverse_path_indent])
        if not check_file_path_exists(file_path_check):
            continue
        else:
            # Found the deepest existing ancestor: create all missing paths.
            for path_indent in range(reverse_path_indent, len(file_path_split)):
                file_path = "/".join(file_path_split[0: path_indent + 1])
                _id = update_db(file_path)
            return _id

# Faster variant: binary search over prefix lengths instead of a linear scan.
def process_to_update_db_new(file_path):
    """Locate the deepest existing prefix of *file_path* with a biased
    binary search, then insert every missing path below it.

    Side effects: increments the global ``search_loop_count`` once per
    bisection step (the benchmark metric) and writes via ``update_db``.
    Returns the id of the full path, existing or newly created.

    NOTE(review): an empty *file_path* that is absent from the DB leaves
    ``file_path_split`` unbound and raises NameError — assumes callers
    pass non-empty paths.
    """
    global search_loop_count
    # first of all, check to see if the path already exist:
    try:
        return system_file_paths[file_path]
    except KeyError:
        # do things the hard way
        # Split file path 
        if file_path:
            file_path_split = [file_path]
        if "/" in file_path:
            file_path_split = file_path.split("/")
        # Drop empty components from leading/trailing/double slashes.
        while '' in file_path_split:
            file_path_split.remove('')
        start_index = 0
        end_index = len(file_path_split) - 1
        current_path = ''
        build_path_start_index = 0
        # bias seach to the last directory minus one, assuming that most directory lists add a new path at the end
        current_index = end_index - 1

        # let's do a binary search on the paths to find where we need to start adding directories
        while start_index != end_index:
            search_loop_count += 1

            # Extend current_path incrementally rather than re-joining the
            # whole prefix on every probe.
            for i in range(build_path_start_index, current_index+1):
                if i != 0:
                    current_path += '/'
                current_path += file_path_split[i]

            #print("start_index: %d, end_index: %d, current_index: %d : %s"%(start_index,end_index,current_index,current_path))

            if check_file_path_exists(current_path):
                # Prefix exists: the first missing component lies to the right.
                start_index = current_index+1
                if start_index > end_index:
                    start_index = end_index
                build_path_start_index = start_index
            else:
                # Prefix missing: rebuild from scratch and search to the left.
                current_path = ''
                build_path_start_index = 0
                end_index = current_index-1
                if end_index < start_index:
                    end_index = start_index

            current_index = int((end_index-start_index)/2) + start_index

        # build up the existing path
        for i in range(build_path_start_index, start_index+1):
            if i != 0:
                current_path += '/'
            current_path += file_path_split[i]

        # do the actual insertion of the UUID for new paths
        for i in range(start_index, len(file_path_split)):
            if i == start_index:
                # check if current path exists
                if check_file_path_exists(current_path):
                    # move on to next index
                    continue
                else:
                    # current_path doesn't exist, create it below
                    pass
            else:
                # append next path item
                if i != 0:
                    current_path += '/'
                current_path += file_path_split[i]

            current_id = update_db(current_path)

        return current_id


# Drive the benchmark: insert each test path that is not yet registered,
# verify the insertion landed, and report the probe/insert counters.
for current_path in file_path:
    if check_file_path_exists(current_path):
        continue
    search_loop_count = 0
    create_loop_count = 0
    _id = process_to_update_db_new(current_path)
    # Cross-check that the full path is now registered.
    verdict = "SUCCESS" if check_file_path_exists(current_path) else "!!!FAILURE!!!"
    print(verdict)
    print("search_loop_count: %d create_loop_count %d"%(search_loop_count, create_loop_count))

print(json.dumps(system_file_paths,indent=4,sort_keys=True))


create_loop_count在您的原始和我的之間是相同的,但是搜索循環的數量有所不同:

            | search_loop_count |
------------|-------------------|
Test String | Original |  New   |
------------|----------|--------|
    1       |    1     |   0    |
    2       |    5     |   2    |
    3       |    5     |   3    |
    4       |    12    |   5    |
    5       |    2     |   1    |
    6       |    1     |   1    |
    7       |    4     |   4    |
    8       |    21    |   5    |

如果路徑長度為數千,正如您在問題中所說,那么差異會很大。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM