[英]Python script use while loop to keep updating job scripts and multiprocess the tasks in queue
我正在嘗試編寫一個掃描文件夾的python腳本並收集更新的SQL腳本,然后自動為SQL腳本提取數據。 在代碼中,while循環是掃描新的SQL文件,並發送到數據拉取功能。 我無法理解如何使用while循環創建動態隊列,但也有多進程來運行隊列中的任務。
下面的代碼有一個問題:while循環的每次迭代都會先等待耗時較長的作業完成,之後才進入下一次迭代去收集其他作業來填滿空閑的處理器。
更新:
感謝@pbacterio發現了這個錯誤,現在錯誤消息消失了。 更改代碼后,Python代碼可以在一次迭代中獲取所有作業腳本,並將腳本分發到四個處理器。 但是,要等很長時間才會進入下一次迭代去掃描並提交新添加的作業腳本。 有人知道該如何重構代碼嗎?
我終於找到了解決方案,請參閱下面的答案。 事實證明我正在尋找的是
the_queue = Queue()
the_pool = Pool(4,worker_main,(the_queue,))
對於那些偶然發現類似想法的人來說,下面是這個自動化腳本的整個架構,它將共享驅動器轉換為“用於SQL拉動的服務器”或任何其他作業隊列“服務器”。
a. Python腳本auto_data_pull.py
如答案所示。 您需要添加自己的作業函數。
b. 一個“批處理腳本”,其中包括:
start C:\\Anaconda2\\python.exe C:\\Users\\bin\\auto_data_pull.py
c. 添加一個在計算機啟動時觸發的計劃任務,讓它運行上述“批處理腳本”。 這樣就可以正常工作了。
Python代碼:
from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support
#
# Function run by worker processes
#
def worker(input, output):
    """Consume tasks from *input* until the 'STOP' sentinel is received.

    Each task is a ``(func, args)`` pair; it is handed to ``compute`` and the
    value returned by ``compute`` is placed on *output*.

    Args:
        input: Queue-like object whose ``get()`` yields tasks or 'STOP'.
        output: Queue-like object whose ``put()`` receives each result.
    """
    while True:
        task = input.get()
        if task == 'STOP':
            # Sentinel: stop consuming and let the caller/process finish.
            break
        func, args = task
        output.put(compute(func, args))
#
# Function used to compute result
#
def compute(func, args):
    """Call ``func(args)`` and describe the outcome as a string.

    Args:
        func: Callable taking a single positional argument.
        args: The single argument passed to *func* (not unpacked).

    Returns:
        A string of the form
        ``'<process name> says that <func name><args> = <result>'``.
    """
    outcome = func(args)
    template = '%s says that %s%s = %s'
    return template % (current_process().name, func.__name__, args, outcome)
def query_sql(sql_file):  # test func
    """Placeholder job: write a marker ``.csv`` file next to *sql_file*.

    The real job is meant to process the .jsl file, run the SQL query and save
    the data table to csv; this stand-in only records that it was invoked.

    Args:
        sql_file: Path of the job script. Its extension is replaced by
            ``.csv`` to build the output path.

    Returns:
        A completion message that names *sql_file*.
    """
    fo_name = os.path.splitext(sql_file)[0] + '.csv'
    # The context manager closes (and flushes) the file even if write() fails;
    # the handle was previously left open.
    with open(fo_name, 'w') as fo:
        print(sql_file)
        fo.write("sql_file {0} is done\n".format(sql_file))
    # The format string lacked a placeholder, so the file name was dropped.
    return "Query is done for {0}\n".format(sql_file)
def check_files(path):
    """
    Collect the modification time of every .jsl file found under any
    ``<path>/*/IDABox/`` directory (searched recursively).

    arguments -- root path to monitor
    returns -- dictionary of {file: timestamp, ...}
    """
    files_dict = {}
    for sql_query_dir in glob(path + "/*/IDABox/"):
        for root, _dirs, filenames in os.walk(sql_query_dir):
            for filename in filenames:
                if not filename.endswith('.jsl'):
                    continue
                # os.walk reports sub-directory roots without a trailing
                # separator, so plain concatenation builds a wrong path there.
                full_path = os.path.join(root, filename)
                files_dict[full_path] = os.path.getmtime(full_path)
    return files_dict
##### working in single thread
def single_thread():
    """Poll "Y:/" every 3 seconds and process changed job scripts serially.

    A file counts as changed when it is new since the previous scan or when
    its modification time has increased. Each changed file is passed to
    ``query_sql`` in the calling process. This function never returns.
    """
    path = "Y:/"
    before = check_files(path)
    while True:
        time.sleep(3)
        after = check_files(path)
        added = [f for f in after if f not in before]
        updated = [f for f in after if f in before and before[f] < after[f]]
        before = after
        for sql_file in added + updated:
            try:
                query_sql(sql_file)
            except Exception as exc:
                # Keep polling after a failed job, but report it instead of
                # swallowing it silently. Catching Exception (not a bare
                # except) lets KeyboardInterrupt stop the loop.
                print("{0} failed with error {1!r}".format(sql_file, exc))
##### not working in queue
def multiple_thread():
    """Poll "Y:/" every 5 seconds and run changed job scripts in batches.

    For every scan that finds new or modified files, the files are queued as
    ``(query_sql, path)`` tasks, NUMBER_OF_PROCESSES worker processes are
    started to consume them, and one result per task is printed.

    The loop blocks until every task of the current batch has produced a
    result, so files that appear while a long job is running are only picked
    up after the whole batch finishes. This function never returns.
    """
    NUMBER_OF_PROCESSES = 4
    path = "Y:/"
    before = check_files(path)  # get the current dictionary of sql_files
    task_queue = Queue()
    done_queue = Queue()
    while True:  # while loop to check the changes of the files
        time.sleep(5)
        after = check_files(path)
        added = [f for f in after if f not in before]
        updated = [f for f in after if f in before and before[f] < after[f]]
        before = after
        TASKS = [(query_sql, sql_file) for sql_file in added + updated]
        if not TASKS:
            # Nothing changed: do not spawn worker processes for an empty batch.
            continue

        # Submit tasks
        for task in TASKS:
            task_queue.put(task)

        # Process.start() returns None, so keep the Process objects themselves
        # in order to join them once the batch is done.
        workers = [Process(target=worker, args=(task_queue, done_queue))
                   for _ in range(NUMBER_OF_PROCESSES)]
        for proc in workers:
            proc.start()

        # Get and print results
        print('Unordered results:')
        for _ in TASKS:
            print('\t%s' % (done_queue.get(),))

        # Tell child processes to stop, then wait for them to exit so finished
        # processes do not pile up across iterations.
        for _ in workers:
            task_queue.put('STOP')
        for proc in workers:
            proc.join()
# single_thread()
# Script entry point: start the multi-process watcher only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    # freeze_support()
    multiple_thread()
參考:
你在哪里定義了multiple_thread()
中的sql_file
multiprocessing.Process(target=query_sql, args=(sql_file)).start()
您尚未在方法中定義sql_file
,而且您已在for循環中使用該變量。 變量的范圍僅限於for循環。
嘗試替換這個:
result = func(*args)
替換為:
result = func(args)
我已經想到了這一點。 感謝您的回應激發了這一想法。 現在,腳本可以運行while循環來監視文件夾以獲取新的更新/添加的SQL腳本,然后將數據分配到多個線程。 解決方案來自queue.get()和queue.put()。 我假設隊列對象自己處理通信。
這是最終的代碼 -
from glob import glob
import os, time
import sys
import pypyodbc
from multiprocessing import Process, Queue, Event, Pool, current_process, freeze_support
def query_sql(sql_file):  # test func
    """Placeholder job: write a marker ``.csv`` file next to *sql_file*.

    The real job is meant to process the .jsl file, run the SQL query and save
    the data table to csv; this stand-in only records that it was invoked.

    Args:
        sql_file: Path of the job script. Its extension is replaced by
            ``.csv`` to build the output path.

    Returns:
        A completion message that names *sql_file*.
    """
    fo_name = os.path.splitext(sql_file)[0] + '.csv'
    # The context manager closes (and flushes) the file even if write() fails;
    # the handle was previously left open.
    with open(fo_name, 'w') as fo:
        print(sql_file)
        fo.write("sql_file {0} is done\n".format(sql_file))
    # The format string lacked a placeholder, so the file name was dropped.
    return "Query is done for {0}\n".format(sql_file)
def check_files(path):
    """
    Collect the modification time of every .jsl file found under any
    ``<path>/*/IDABox/`` directory (searched recursively). Files whose
    timestamp cannot be read are skipped.

    arguments -- root path to monitor
    returns -- dictionary of {file: timestamp, ...}
    """
    files_dict = {}
    for sql_query_dir in glob(path + "/*/IDABox/"):
        for root, _dirs, filenames in os.walk(sql_query_dir):
            for filename in filenames:
                if not filename.endswith('.jsl'):
                    continue
                # os.walk reports sub-directory roots without a trailing
                # separator, so plain concatenation builds a wrong path there.
                full_path = os.path.join(root, filename)
                try:
                    files_dict[full_path] = os.path.getmtime(full_path)
                except OSError:
                    # Best effort: a file may disappear or be unreadable
                    # between the directory listing and the stat call. Skip
                    # just that file rather than abandoning the whole scan.
                    continue
    return files_dict
def worker_main(queue):
    """Announce this process, then run ``query_sql`` on every queued item.

    Args:
        queue: Queue-like object; each item obtained from it is passed
            unchanged to ``query_sql``. The loop never exits on its own.
    """
    print("%s working" % os.getpid())
    while True:
        # Blocking get: wait until the producer supplies the next item.
        job = queue.get(True)
        query_sql(job)
def main():
    """Watch "Y:/" and feed changed job scripts to a pool of four workers.

    Every 5 seconds the directory tree is rescanned; files that are new, or
    whose modification time increased since the previous scan, are put on a
    shared queue consumed by the pool processes. This function never returns.
    """
    the_queue = Queue()
    # worker_main is passed as the pool *initializer*: each of the 4 pool
    # processes runs it with the_queue on start-up. Keep the reference to the
    # pool alive for the lifetime of the loop even though no tasks are
    # submitted through the pool itself.
    the_pool = Pool(4, worker_main, (the_queue,))
    path = "Y:/"
    before = check_files(path)  # get the current dictionary of sql_files
    while True:  # while loop to check the changes of the files
        time.sleep(5)
        after = check_files(path)
        added = [f for f in after if f not in before]
        updated = [f for f in after if f in before and before[f] < after[f]]
        before = after
        for jsl_file in added + updated:
            try:
                the_queue.put(jsl_file)
            except Exception:
                # Report and carry on with the remaining files. Exception (not
                # a bare except) lets KeyboardInterrupt/SystemExit propagate.
                print("{0} failed with error {1}. \n".format(jsl_file, str(sys.exc_info()[0])))
# Script entry point: start the watcher only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.