[英]What is the most efficient way to find all files containing given string, case-insensitive using Python 3
I have a folder with many subfolders and 1000+ text files (50-200mb per file) on different levels, and I want to find which files contain a particular string (regardless of the casing).我有一个文件夹,其中包含许多子文件夹和 1000 多个不同级别的文本文件(每个文件 50-200mb),我想找到哪些文件包含特定的字符串(无论大小写)。
How to implement this for best performance ?如何实现这一点以获得最佳性能? (python3.6.5+)
(python3.6.5+)
The task can be decomposed into two sub-tasks:任务可以分解为两个子任务:
Get all the files in the folder.获取文件夹中的所有文件。
Check which files contain the string (case-insensitive).检查哪些文件包含字符串(不区分大小写)。
For the first task (Get all the files in the folder) I've written a benchmark for a couple of approaches, and the results are below:对于第一个任务(获取文件夹中的所有文件),我为几种方法编写了基准测试,结果如下:
# test_get_files.py
from glob import glob
from os import walk
from os.path import isfile, join
from timeit import timeit
from tabulate import tabulate
FOLDER = r"/path/to/dir/with/many/nested/files"
def get_files_using_walk_top_down_list(folder):
    """Return a list with the path of every file found under ``folder``.

    The tree is traversed with ``walk`` in top-down order and each file name
    is joined onto the directory that contains it.
    """
    collected = []
    for directory, _subdirs, names in walk(folder, topdown=True):
        for name in names:
            collected.append(join(directory, name))
    return collected
def get_files_using_walk_top_down_tuple(folder):
    """Return a tuple with the path of every file found under ``folder``.

    The tree is traversed with ``walk`` in top-down order and each file name
    is joined onto the directory that contains it.
    """
    collected = []
    for directory, _subdirs, names in walk(folder, topdown=True):
        collected.extend(join(directory, name) for name in names)
    return tuple(collected)
def get_files_using_walk_top_down_set(folder):
    """Return a set with the path of every file found under ``folder``.

    The tree is traversed with ``walk`` in top-down order and each file name
    is joined onto the directory that contains it.
    """
    unique_paths = set()
    for directory, _subdirs, names in walk(folder, topdown=True):
        unique_paths.update(join(directory, name) for name in names)
    return unique_paths
def get_files_using_walk_bottom_up_list(folder):
    """Return a list with the path of every file found under ``folder``.

    The tree is traversed with ``walk`` in bottom-up order
    (``topdown=False``) and each file name is joined onto the directory that
    contains it.
    """
    collected = []
    for directory, _subdirs, names in walk(folder, topdown=False):
        for name in names:
            collected.append(join(directory, name))
    return collected
def get_files_using_walk_bottom_up_tuple(folder):
    """Return a tuple with the path of every file found under ``folder``.

    The tree is traversed with ``walk`` in bottom-up order
    (``topdown=False``) and each file name is joined onto the directory that
    contains it.
    """
    collected = []
    for directory, _subdirs, names in walk(folder, topdown=False):
        collected.extend(join(directory, name) for name in names)
    return tuple(collected)
def get_files_using_walk_bottom_up_set(folder):
    """Return a set with the path of every file found under ``folder``.

    The tree is traversed with ``walk`` in bottom-up order
    (``topdown=False``) and each file name is joined onto the directory that
    contains it.
    """
    unique_paths = set()
    for directory, _subdirs, names in walk(folder, topdown=False):
        unique_paths.update(join(directory, name) for name in names)
    return unique_paths
def get_files_using_glob_list(folder):
    """Return a list with the path of every file found under ``folder``.

    Candidates come from a recursive ``glob`` on ``{folder}/**/*``; only the
    entries for which ``isfile`` is true are kept.
    """
    pattern = f"{folder}/**/*"
    return list(filter(isfile, glob(pattern, recursive=True)))
def get_files_using_glob_tuple(folder):
    """Return a tuple with the path of every file found under ``folder``.

    Candidates come from a recursive ``glob`` on ``{folder}/**/*``; only the
    entries for which ``isfile`` is true are kept.
    """
    pattern = f"{folder}/**/*"
    return tuple(filter(isfile, glob(pattern, recursive=True)))
def get_files_using_glob_set(folder):
    """Return a set with the path of every file found under ``folder``.

    Candidates come from a recursive ``glob`` on ``{folder}/**/*``; only the
    entries for which ``isfile`` is true are kept.
    """
    pattern = f"{folder}/**/*"
    return set(filter(isfile, glob(pattern, recursive=True)))
def benchmark(func, folder, number):
    """Time ``number`` calls of ``func(folder)`` and return the total seconds.

    The callable and its argument are handed to ``timeit`` through the
    ``globals`` namespace instead of being spliced into the statement's source
    text. A folder path containing quotes or backslashes is therefore passed
    through unchanged, and ``func`` does not have to be importable from a
    module with a particular name.

    :param func: callable taking the folder path as its only argument
    :param folder: path handed to ``func`` on every call
    :param number: how many times ``func(folder)`` is executed
    :return: total elapsed time in seconds, as reported by ``timeit``
    """
    return timeit(stmt="func(folder)",
                  globals={"func": func, "folder": folder},
                  number=number)
def main():
    """Benchmark every file-collection strategy and print a results table.

    Each strategy is timed against ``FOLDER`` once per repetition count. Rows
    are grouped by repetition count, in the order the strategies are listed.
    """
    candidates = (
        get_files_using_glob_list,
        get_files_using_glob_tuple,
        get_files_using_glob_set,
        get_files_using_walk_top_down_list,
        get_files_using_walk_top_down_tuple,
        get_files_using_walk_top_down_set,
        get_files_using_walk_bottom_up_list,
        get_files_using_walk_bottom_up_tuple,
        get_files_using_walk_bottom_up_set,
    )
    rows = []
    for repetitions in (100, 500, 1000):
        for candidate in candidates:
            elapsed = benchmark(candidate, FOLDER, repetitions)
            rows.append((candidate.__name__, repetitions, elapsed))
    print(tabulate(rows, headers="FUNC_NAME NUMBER TOOK".split()))


if __name__ == '__main__':
    main()
The output is:输出是:
FUNC_NAME NUMBER TOOK
------------------------------------ -------- --------
get_files_using_glob_list 100 0.945687
get_files_using_glob_tuple 100 0.852411
get_files_using_glob_set 100 0.861514
get_files_using_walk_top_down_list 100 0.320643
get_files_using_walk_top_down_tuple 100 0.326478
get_files_using_walk_top_down_set 100 0.33721
get_files_using_walk_bottom_up_list 100 0.28824
get_files_using_walk_bottom_up_tuple 100 0.295585
get_files_using_walk_bottom_up_set 100 0.304363
get_files_using_glob_list 500 4.23723
get_files_using_glob_tuple 500 4.2692
get_files_using_glob_set 500 4.30241
get_files_using_walk_top_down_list 500 1.59499
get_files_using_walk_top_down_tuple 500 1.62841
get_files_using_walk_top_down_set 500 1.67612
get_files_using_walk_bottom_up_list 500 1.43197
get_files_using_walk_bottom_up_tuple 500 1.45971
get_files_using_walk_bottom_up_set 500 1.51071
get_files_using_glob_list 1000 8.42451
get_files_using_glob_tuple 1000 8.61827
get_files_using_glob_set 1000 8.60752
get_files_using_walk_top_down_list 1000 3.18595
get_files_using_walk_top_down_tuple 1000 3.24857
get_files_using_walk_top_down_set 1000 3.35619
get_files_using_walk_bottom_up_list 1000 2.86118
get_files_using_walk_bottom_up_tuple 1000 2.92635
get_files_using_walk_bottom_up_set 1000 3.01853
I think there's not much room for further improvement in how the files are collected.我认为文件的收集方式没有太多进一步改进的空间。
get_files_using_walk_bottom_up_list wins the race. get_files_using_walk_bottom_up_list 赢得比赛。
Now for the second task (Check which files contain the string (case-insensitive)): since reading the complete file contents before doing the check incurs the constant overhead of physically reading every byte in the file, checking the file line by line and breaking out of the loop on the first matching line seems the most reasonable way to do it.现在进行第二个任务(检查哪些文件包含字符串(不区分大小写)):由于在检查之前读取完整的文件内容,会产生物理读取文件中所有字节的固定开销,因此逐行检查文件并在遇到第一个匹配行时跳出循环似乎是最合理的做法。
Here is the benchmark code for checking the files in different ways:以下是以不同方式检查文件的基准代码:
# search_showcase.py
from os import walk
from os.path import join
from random import choice, randint
from timeit import timeit
from tabulate import tabulate
def get_files_using_walk_bottom_up_list(folder):
    """Return a list with the path of every file found under ``folder``.

    Directories are visited with ``walk`` in bottom-up order
    (``topdown=False``); every file name is joined onto its directory.
    """
    paths = []
    for parent, _dirnames, filenames in walk(folder, topdown=False):
        paths += [join(parent, filename) for filename in filenames]
    return paths
def check_using_in(file, text):
    """Return True if ``text`` occurs in ``file``, ignoring letter case.

    The file is read as UTF-8 one line at a time and scanning stops at the
    first matching line, so a match spanning a line break is not detected.

    :param file: path of the text file to scan
    :param text: string to look for; its casing does not matter
    :return: True when some line contains ``text``, otherwise False
    """
    # Lower-case the search text once so both sides of the comparison are
    # normalised; otherwise any uppercase letter in ``text`` can never match.
    needle = text.lower()
    with open(file, mode="r", encoding="utf-8") as fp:
        for line in fp:
            if needle in line.lower():
                return True
    return False
def check_using_index(file, text):
    """Return True if ``text`` occurs in ``file``, ignoring letter case.

    Uses ``str.index`` for the per-line test. The file is read as UTF-8 one
    line at a time and scanning stops at the first matching line, so a match
    spanning a line break is not detected.

    :param file: path of the text file to scan
    :param text: string to look for; its casing does not matter
    :return: True when some line contains ``text``, otherwise False
    """
    # Lower-case the search text once so both sides of the comparison are
    # normalised; otherwise any uppercase letter in ``text`` can never match.
    needle = text.lower()
    with open(file, mode="r", encoding="utf-8") as fp:
        for line in fp:
            try:
                line.lower().index(needle)
            except ValueError:
                # ``str.index`` raises ValueError when the substring is
                # missing; catch only that so interrupts and real errors
                # still propagate.
                continue
            else:
                return True
    return False
def check_using_find(file, text):
    """Return True if ``text`` occurs in ``file``, ignoring letter case.

    Uses ``str.find`` for the per-line test. The file is read as UTF-8 one
    line at a time and scanning stops at the first matching line, so a match
    spanning a line break is not detected.

    :param file: path of the text file to scan
    :param text: string to look for; its casing does not matter
    :return: True when some line contains ``text``, otherwise False
    """
    # Lower-case the search text once so both sides of the comparison are
    # normalised; otherwise any uppercase letter in ``text`` can never match.
    needle = text.lower()
    with open(file, mode="r", encoding="utf-8") as fp:
        for line in fp:
            if line.lower().find(needle) != -1:
                return True
    return False
def get_files(files, check_func, text):
    """Return the entries of ``files`` for which ``check_func`` is truthy.

    ``check_func`` is called as ``check_func(file, text)`` for each entry, in
    order; the accepted entries are returned as a list in that same order.
    """
    matching = []
    for candidate in files:
        if check_func(candidate, text):
            matching.append(candidate)
    return matching
def benchmark(func, files, text, number):
    """Time ``number`` runs of ``get_files(files, func, text)``.

    The file list, the check function and the search text are handed to
    ``timeit`` through the ``globals`` namespace instead of being spliced into
    the statement's source text. Paths or search text containing quotes or
    backslashes are therefore passed through unchanged, and nothing has to be
    importable from a module with a particular name.

    :param func: check function called as ``func(file, text)``
    :param files: paths of the files to scan on every run
    :param text: string to search for
    :param number: how many times the whole scan is executed
    :return: total elapsed time in seconds, as reported by ``timeit``
    """
    return timeit(stmt="get_files(files, func, text)",
                  globals={"get_files": get_files, "files": files,
                           "func": func, "text": text},
                  number=number)
def main():
    """Compare the three substring checks on a random sample of files.

    Between 25 and 50 paths are drawn from the folder with independent
    ``choice`` calls (so the sample may contain duplicates). The checks are
    first verified to agree on that sample, then each one is timed for every
    repetition count and the timings are printed as a table.
    """
    text = "not-so-common-word"
    folder = r"/path/to/files/dir"
    funcs = (check_using_in, check_using_find, check_using_index)

    # benchmark a random selection rather than every file to keep the run short
    all_files = get_files_using_walk_bottom_up_list(folder)
    sample_size = randint(25, 50)
    some_files = [choice(all_files) for _ in range(sample_size)]
    print(f"selected {len(some_files)} of {len(all_files)} files\n")

    # all implementations must report the same files before timing them
    results_by_func = {}
    for func in funcs:
        results_by_func[func] = get_files(some_files, func, text)
    via_in = results_by_func[check_using_in]
    via_find = results_by_func[check_using_find]
    via_index = results_by_func[check_using_index]
    assert via_in == via_find == via_index

    # show how many files matched, as evidence the checks found something
    details = {}
    for func, results in results_by_func.items():
        details[func.__name__] = len(results)
    print(tabulate(tabular_data=[details], headers="keys"))

    # run the timings, grouped by repetition count
    benchmark_results = []
    for number in (5, 10):
        for func in funcs:
            took = benchmark(func, some_files, text, number)
            benchmark_results.append((func.__name__, number, took))

    print(tabulate(benchmark_results, headers="FUNC_NAME NUMBER TOOK".split()))


if __name__ == '__main__':
    main()
And here is the output:这是输出:
selected 41 of 764 files
check_using_in check_using_find check_using_index
---------------- ------------------ -------------------
13 13 13
FUNC_NAME NUMBER TOOK
----------------- -------- --------
check_using_in 5 0.475356
check_using_find 5 0.678626
check_using_index 5 1.23154
check_using_in 10 0.941205
check_using_find 10 1.35866
check_using_index 10 2.4155
Can you tell me if there is any faster way to write this check (without using multi-threading/processing)?你能告诉我是否有更快的方法来编写这个检查(不使用多线程/处理)?
I think this would be a perfect use case for the multiprocessing module.我认为这将是多处理模块的完美用例。 Inside the process function, you can open the file, call readlines(), check whether the string exists, and return the result.
在 process 函数中,您可以打开文件 do readlines(),检查字符串是否存在,如果字符串存在则返回。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.