I am currently writing a multithreaded client for making HTTP requests with a rate-limit, and for that I am using a 'manager' Process, that handles the ratelimit and enqueues new requests into the Pool.
For some reason, the queue doesn't seem to get synchronized between the class, the manager, and the worker processes.
from multiprocessing import JoinableQueue, Pool, Value
from multiprocessing.dummy import Process
from time import time
from typing import Callable
import requests
from requests import Request, Response
class HTTPWorkerPool:
    """Rate-limited HTTP client that feeds queued requests to a worker Pool.

    Requests are queued with ``submit``; a background manager started in
    ``__init__`` dispatches ``_worker`` tasks to the pool, allowing at most
    ``requests`` dispatches per ``period`` (measured with ``time()``).
    """

    def __init__(self, requests: int, period: float, processes: int = None, daemon: bool = False):
        """Create the pool, queue and shared values, then start the manager.

        Args:
            requests: Maximum number of dispatches per ``period``.
            period: Length of one rate-limit window, compared against ``time()``.
            processes: Forwarded to ``Pool(processes=...)``.
            daemon: Not read in this method; the manager is always marked
                as a daemon below.
        """
        self._pool = Pool(processes=processes)
        self._queue = JoinableQueue()
        # Shared float holding the most recently measured dispatch rate.
        self.rps = Value('f', 0.0)
        # Shared flag polled by the manager loop; cleared in join().
        self._running = Value('b', True)
        self._manager = Process(name='fastclient-manager', target=self._manager_, args=(period,
                                requests, self._pool, self._queue, self.rps, self._running))
        self._manager.daemon = True
        self._manager.start()

    def __del__(self):
        """Run join() when the instance is garbage-collected."""
        self.join()

    def _manager_(self, period, requests, pool, queue, rps, running):
        """Dispatch loop: hand worker tasks to the pool within the rate limit.

        Args:
            period: Window length used to reset the dispatch counter.
            requests: Maximum dispatches allowed per window.
            pool: Pool that receives the ``_worker`` tasks.
            queue: Queue checked for pending work.
            rps: Shared value updated with the measured rate at each reset.
            running: Shared flag; the loop ends once its value is false.
        """
        limited = False
        current_requests = 0
        last_clear = 0.0
        while running.value:
            if queue.empty():
                continue # keep waiting for input. If join wasn't called, this will still be used.
            if current_requests >= requests:
                limited = True
            current_time = time()
            # Window elapsed: publish the rate and start a fresh window.
            if last_clear + period <= current_time:
                rps.value = current_requests/(current_time-last_clear)
                last_clear = current_time
                limited = False
                current_requests = 0
            if not limited:
                print(f'in queue {queue.qsize()}')
                # NOTE(review): the queue is passed bare rather than as an
                # argument tuple, and the returned result object is discarded,
                # so any failure of this task goes unnoticed.
                pool.apply_async(self._worker, queue)
                current_requests += 1

    def _worker(self, queue: JoinableQueue):
        """Take one (request, callback) pair off the queue and process it."""
        (req, cb) = queue.get()
        # NOTE(review): this calls send() on the requests module itself;
        # confirm that such a module-level function exists.
        cb(requests.send(req.prepare()))
        queue.task_done()

    def join(self):
        """Wait for the queue to drain, stop the manager and shut the pool down."""
        self._queue.close()
        self._queue.join()
        self._running.value = False
        self._manager.join()
        self._pool.close()
        self._pool.terminate()
        self._pool.join()

    def submit(self, request: Request, callback: Callable[[Response], None]):
        """Queue ``request`` together with the callback for its response."""
        self._queue.put((request, callback))
and for testing
from time import sleep
from fastclient import HTTPWorkerPool
from requests import Request
def cb(res):
    """Print the body text of a finished response."""
    print(res.text)


if __name__ == '__main__':
    # Allow 10 dispatches per period of 1.
    pool = HTTPWorkerPool(10, 1)
    # Queue 100 identical GET requests, each reporting back through cb.
    for _ in range(100):
        pool.submit(Request(method='GET', url='https://httpbin.org/get'), cb)
    # Print the measured rate once a second, ten times.
    for _ in range(10):
        sleep(1)
        print(pool.rps.value)
The output is a bunch of `100`s (the queue length) and, every second, `9.9...` (the requests-per-second).
The queue length stays at `100` and doesn't decrease.
Does someone know, how I can properly synchronise the queue(s), in order to have the tasks completed?
There are a few mistakes here.
Inside `_manager_` you are creating tasks in the pool with `self._worker` as the target. For this to work, `self` would need to be pickled as well. However, your instance is unpicklable, since it contains a `Pool` object and a `Process` object. The only reason the instance was pickled when creating the `_manager_` process is that `Process` handles pickling a little differently from the `Pool` class.
So, since `_worker` does not use any instance attributes anyway, you can make it a function outside the class, or a staticmethod of the class.
You cannot share queues as arguments to processes in pools:
from multiprocessing import JoinableQueue, Pool
def a(pool):
    """No-op task target; despite the parameter name it is handed the queue."""
    pass


if __name__ == "__main__":
    p = Pool(1)
    q = JoinableQueue()
    # Passing the queue itself as a task argument is what this demonstrates.
    p1 = p.apply_async(a, (q,))
    # get() waits for the task and surfaces the error shown in the output.
    p1.get()
Output
RuntimeError: JoinableQueue objects should only be shared between processes through inheritance
You can however, create a queue through managers which basically share a proxy of the queue, instead of the actual queue, allowing it to be pickled.
from multiprocessing import JoinableQueue, Pool, Manager
def a(pool):
    """Task target that only reports completion.

    Args:
        pool: Unused; despite its name it receives the managed queue proxy.
    """
    print('all done')


if __name__ == "__main__":
    m = Manager()
    p = Pool(1)
    # A manager-created queue is a picklable proxy, so it can be passed as a
    # task argument where a plain JoinableQueue cannot.
    q = m.JoinableQueue()
    p1 = p.apply_async(a, (q,))
    # Wait for the task: without this the script can exit before the worker
    # runs, and an exception raised by the task would never be reported.
    p1.get()
Output
all done
This probably forms the crux of the issue. Your code is actually raising errors every time you try to spawn a `_worker`, but you will never know that, since you haven't specified an error callback, nor are you calling `.get()` on the tasks you spawn using the pool.
When you create a task using `apply_async`, the return value is an `AsyncResult`, which acts similarly to a promise. The results (or exceptions raised) are only provided to you after you wait for this pseudo-promise to resolve using `.get()`, or through callbacks, for both errors and successful results. Since you are doing neither, you will never know what actually happened when you spawned the worker, except for the fact that it did not work.
Firstly, the function `pool.apply_async` accepts as its second argument an iterable of arguments, which will be unpacked to the target function. Even if the target function accepts only one argument, that argument still needs to be wrapped in an iterable when passed to `apply_async`. Therefore, this line:
pool.apply_async(type(self)._worker, queue)
Should become this:
pool.apply_async(type(self)._worker, (queue,))
Secondly, when you are sending prepared requests using `requests`, you need to send them using a session. Hence this line:
cb(requests.send(req.prepare()))
Should become this:
cb(requests.Session().send(req.prepare()))
Fixing all these errors will result in your code somewhat like this:
from multiprocessing import JoinableQueue, Pool, Value, Manager
from multiprocessing.dummy import Process
from time import time
from typing import Callable
from queue import Queue
import requests
from requests import Request, Response
class HTTPWorkerPool:
    """Process pool that performs queued HTTP requests under a rate limit.

    ``submit`` puts ``(request, callback)`` pairs on a manager-backed queue.
    A background manager thread dispatches at most ``requests`` worker tasks
    per ``period`` seconds; each task takes one pair off the queue, sends the
    request and passes the response to the callback inside the worker
    process. The most recently measured dispatch rate is published in ``rps``.
    """

    # Seconds the manager sleeps between polls while idle or rate-limited.
    _POLL_INTERVAL = 0.01

    def __init__(self, requests: int, period: float, processes: int = None, daemon: bool = False):
        """Create the pool and the shared state, then start the manager thread.

        Args:
            requests: Maximum number of dispatches per ``period``.
            period: Length of one rate-limit window in seconds.
            processes: Number of worker processes, forwarded to ``Pool``.
            daemon: Accepted for compatibility but not used; the manager
                thread is always a daemon so it cannot keep the interpreter
                alive when ``join`` is never called.

        Raises:
            ValueError: If ``requests`` is smaller than 1.
        """
        # Validate before any process is started so a bad value leaks nothing.
        if requests < 1:
            raise ValueError('requests must be at least 1')
        manager = Manager()
        self._pool = Pool(processes=processes)
        # A manager queue is a proxy and can therefore be passed to pool tasks.
        self._queue = manager.JoinableQueue()
        self.rps = Value('f', 0.0)
        self._running = Value('b', True)
        self._manager = Process(name='fastclient-manager', target=self._manager_, args=(period,
                                requests, self._pool,
                                self._queue, self.rps,
                                self._running))
        self._manager.daemon = True
        self._manager.start()
        # Set last: __del__ uses it to tell whether there is anything to join.
        self._joined = False

    def __del__(self):
        """Shut down on garbage collection unless already joined."""
        # The attribute is missing when __init__ failed part-way through.
        if not getattr(self, '_joined', True):
            self.join()

    @staticmethod
    def _log_task_error(exc):
        """Report an exception raised by a worker task instead of dropping it."""
        import logging  # local import keeps the file's import block untouched

        logging.getLogger(__name__).error('HTTP worker task failed', exc_info=exc)

    def _manager_(self, period, requests, pool, queue, rps, running):
        """Dispatch loop: hand worker tasks to the pool within the rate limit.

        Args:
            period: Window length in seconds.
            requests: Maximum dispatches allowed per window.
            pool: Pool that receives the ``_worker`` tasks.
            queue: Queue proxy holding the pending ``(request, callback)`` pairs.
            rps: Shared value updated with the measured rate at each reset.
            running: Shared flag; the loop ends once its value is false.
        """
        from time import sleep  # the file-level import only provides time()

        current_requests = 0
        last_clear = 0.0
        while running.value:
            if queue.empty():
                # Nothing to dispatch: yield the CPU instead of spinning.
                sleep(self._POLL_INTERVAL)
                continue
            current_time = time()
            elapsed = current_time - last_clear
            if elapsed >= period:
                # Window is over: publish its rate and start a new one.
                if elapsed > 0:
                    rps.value = current_requests / elapsed
                last_clear = current_time
                current_requests = 0
            if current_requests >= requests:
                # Budget used up; nap briefly so a stop request is still
                # noticed quickly, then re-check the window.
                sleep(min(self._POLL_INTERVAL, period - elapsed))
                continue
            print(f'in queue {queue.qsize()}')
            pool.apply_async(type(self)._worker, (queue,),
                             error_callback=self._log_task_error)
            current_requests += 1

    @staticmethod
    def _worker(queue: JoinableQueue):
        """Take one pending pair off the queue, send it and run its callback.

        Returns without doing anything when the queue is already empty: the
        manager may dispatch more tasks than there are items, because earlier
        tasks have not yet removed theirs when it checks the queue again.
        """
        from queue import Empty  # binds Empty only; the parameter is untouched

        try:
            req, cb = queue.get_nowait()
        except Empty:
            return
        try:
            with requests.Session() as session:
                cb(session.send(req.prepare()))
        finally:
            # Always acknowledge the item, otherwise join() would wait forever
            # after a failed request or callback.
            queue.task_done()

    def join(self):
        """Wait for all submitted requests, then stop the manager and the pool.

        Safe to call more than once; later calls return immediately.
        """
        if getattr(self, '_joined', True):
            return
        self._joined = True
        # The manager queue proxy has no close(); joining is all that is needed.
        self._queue.join()
        self._running.value = False
        self._manager.join()
        self._pool.close()
        self._pool.join()

    def submit(self, request: Request, callback: Callable[[Response], None]):
        """Queue ``request`` for sending; ``callback`` receives the response.

        Both objects travel through the manager queue to a worker process, so
        they must be picklable (e.g. ``callback`` a module-level function).
        """
        self._queue.put((request, callback))
Don't forget to pass an error callback to help in debugging in future.
A few comments:
- `empty` called on a `multiprocessing.Queue` or `multiprocessing.JoinableQueue` is not reliable and should not be used.
- `requests` does not have a function named `send`.

Consider using the following simpler `RateLimitedProcessPool` or `RateLimitedThreadPool` classes, which are more general purpose, as a starting point:
import multiprocessing.pool
import multiprocessing
import threading
from functools import wraps
import time
class RateLimitedPool:
    """Mixin that throttles ``apply_async`` to ``rate`` calls per ``per`` seconds.

    Combine it with a pool class placed after it in the bases so that
    ``super().apply_async`` resolves to the real pool. Calls are counted in
    batches of ``rate``; when a batch is full, the next call sleeps until
    ``per`` seconds have passed since the batch started.
    """

    # There is a lag between the first call to apply_async and the first task
    # actually starting. Err on the side of this being too large:
    LAG_TIME = .2  # seconds - needs to be fine-tuned

    def __init__(self, rate, per):
        """Store the limit and reset the batch state.

        Args:
            rate: Positive int, number of calls allowed per batch.
            per: Positive number, batch length in seconds.
        """
        assert isinstance(rate, int) and rate > 0
        assert isinstance(per, (int, float)) and per > 0
        self.rate = rate
        self.per = per
        self.count = 0
        # Start of the current batch on the monotonic clock; None until the
        # first call.
        self.start_time = None

    def _check_allowed(self):
        """Count one call, sleeping first if the current batch is exhausted."""
        # Intervals are measured with time.monotonic() so that wall-clock
        # adjustments can neither shorten nor stretch the wait.
        if self.start_time is None:  # first time
            self.start_time = time.monotonic() + self.LAG_TIME
            self.count = 1
            return

        if self.count < self.rate:
            self.count += 1
            return

        # Start of a new batch:
        self.count = 1
        current_time = time.monotonic()
        time_to_wait = self.per - (current_time - self.start_time)
        if time_to_wait > 0:
            time.sleep(time_to_wait)
            current_time = time.monotonic()
        self.start_time = current_time

    def apply_async(self, *args, **kwargs):
        """Apply the rate limit, then delegate to the pool's ``apply_async``."""
        self._check_allowed()
        return super().apply_async(*args, **kwargs)
class RateLimitedProcessPool(RateLimitedPool, multiprocessing.pool.Pool):
    """Process pool whose ``apply_async`` is throttled by ``RateLimitedPool``.

    Positional and remaining keyword arguments go to
    ``multiprocessing.pool.Pool``; ``rate`` and ``per`` configure the limiter.
    """

    def __init__(self, *args, rate=5, per=1, **kwargs):
        # Bring the underlying pool up first, then set up the limiter state.
        pool_cls = multiprocessing.pool.Pool
        pool_cls.__init__(self, *args, **kwargs)
        RateLimitedPool.__init__(self, rate=rate, per=per)
class RateLimitedThreadPool(RateLimitedPool, multiprocessing.pool.ThreadPool):
    """Thread pool whose ``apply_async`` is throttled by ``RateLimitedPool``.

    Positional and remaining keyword arguments go to the pool initializer;
    ``rate`` and ``per`` configure the limiter.
    """

    def __init__(self, *args, rate=5, per=1, **kwargs):
        # NOTE(review): the pool is initialised through Pool.__init__ rather
        # than ThreadPool.__init__ - confirm this is intended.
        pool_cls = multiprocessing.pool.Pool
        pool_cls.__init__(self, *args, **kwargs)
        RateLimitedPool.__init__(self, rate=rate, per=per)
########################################
from requests import Request, Response, Session
def cb(res):
    """Print the body text of a completed response."""
    body = res.text
    print(body)
def worker(session, req):
    """Send ``req`` through ``session`` and pass the response to ``cb``."""
    prepared = req.prepare()
    response = session.send(prepared)
    cb(response)
def main():
    """Send 100 rate-limited GET requests through a thread pool and time them.

    Raises:
        Exception: Whatever the first failed task raised, re-raised after the
            elapsed time has been printed.
    """
    # For processes instead of threads use:
    #   RateLimitedProcessPool(rate=10, per=1)  # 10 requests per 1 second
    pool = RateLimitedThreadPool(10, rate=10, per=1)  # 10 requests per 1 second
    # Monotonic clock: the elapsed time must not be affected by clock changes.
    start = time.monotonic()
    # The context manager closes the session's connections when done.
    with Session() as session:
        # Keep the results so that task failures can be surfaced below.
        results = [
            pool.apply_async(
                worker,
                args=(session, Request(method='GET', url='https://httpbin.org/get')),
            )
            for _ in range(100)
        ]
        # Wait for all tasks to complete
        pool.close()
        pool.join()
    print('Total elapsed time:', time.monotonic() - start)
    # All tasks are finished, so get() returns at once or re-raises the
    # exception of a failed task instead of letting it pass silently.
    for result in results:
        result.get()


if __name__ == '__main__':
    main()
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.