Thread pool implementation using pthreads

Question

I am trying to understand the implementation below of a thread pool using pthreads. When I comment out the for loop in main, the program hangs; after adding some logging, it seems to be getting stuck in the pthread_join call in the ThreadPool destructor.

I am unable to understand why this is happening. Is there a deadlock scenario here?

This may be naive, but can someone help me understand why this is happening and how to correct it?

Thanks a lot !!!

#include <stdio.h>
#include <queue>
#include <unistd.h>
#include <pthread.h>
#include <malloc.h>
#include <stdlib.h>

// Abstract unit of work.
// Subclasses override run() to do the (possibly expensive) computation.
// showTask() is a debugging hook and can be removed if it is not used.
class Task {
public:
    Task() {}

    // Virtual so that a concrete task can be destroyed through a Task pointer.
    virtual ~Task() {}

    // Performs the task's work.
    virtual void run() = 0;

    // Debugging hook; what it reports is up to the subclass.
    virtual void showTask() = 0;
};

// Wrapper around std::queue with some mutex protection
class WorkQueue {
public:
WorkQueue() {
    // Initialize the mutex protecting the queue
    pthread_mutex_init(&qmtx,0);

    // wcond is a condition variable that's signaled
    // when new work arrives
    pthread_cond_init(&wcond, 0);
}

~WorkQueue() {
    // Cleanup pthreads
    pthread_mutex_destroy(&qmtx);
    pthread_cond_destroy(&wcond);
}
// Retrieves the next task from the queue
Task *nextTask() {
    // The return value
    Task *nt = 0;

    // Lock the queue mutex
    pthread_mutex_lock(&qmtx);
    // Check if there's work
    if (finished && tasks.size() == 0) {
        // If not return null (0)
        nt = 0;
    } else {
        // Not finished, but there are no tasks, so wait for
        // wcond to be signalled
        if (tasks.size()==0) {
            pthread_cond_wait(&wcond, &qmtx);
        }
        // get the next task
        nt = tasks.front();
        if(nt){
        tasks.pop();
    }

        // For debugging
        if (nt) nt->showTask();
    }
    // Unlock the mutex and return
    pthread_mutex_unlock(&qmtx);
    return nt;
}
// Add a task
void addTask(Task *nt) {
    // Only add the task if the queue isn't marked finished
    if (!finished) {
        // Lock the queue
        pthread_mutex_lock(&qmtx);
        // Add the task
        tasks.push(nt);
        // signal there's new work
        pthread_cond_signal(&wcond);
        // Unlock the mutex
        pthread_mutex_unlock(&qmtx);
    }
}
// Mark the queue finished
void finish() {
    pthread_mutex_lock(&qmtx);
    finished = true;
    // Signal the condition variable in case any threads are waiting
    pthread_cond_signal(&wcond);
    pthread_mutex_unlock(&qmtx);
}

// Check if there's work
bool hasWork() {
//printf("task queue size is %d\n",tasks.size());
    return (tasks.size()>0);
}

private:
std::queue<Task*> tasks;
bool finished;
pthread_mutex_t qmtx;
pthread_cond_t wcond;
};

// Function that retrieves a task from a queue, runs it and deletes it
void *getWork(void* param) {
Task *mw = 0;
WorkQueue *wq = (WorkQueue*)param;
while (mw = wq->nextTask()) {
    mw->run();
    delete mw;
}
pthread_exit(NULL);
}

// Fixed-size pool of pthreads that all run getWork() on one shared WorkQueue.
class ThreadPool {
public:
    // Starts n worker threads. If any pthread_create call fails, the error
    // code is printed and the process exits.
    ThreadPool(int n) : _numThreads(n) {
        printf("Creating a thread pool with %d threads\n", n);
        threads = new pthread_t[n];
        int idx = 0;
        while (idx < n) {
            const int status = pthread_create(threads + idx, 0, getWork, &workQueue);
            if (status != 0) {
                printf("ERROR; return code from pthread_create() is %d\n", status);
                exit(-1);
            }
            ++idx;
        }
    }

    // Marks the queue finished, joins every worker, then releases the
    // thread handle array.
    ~ThreadPool() {
        workQueue.finish();
        for (int joined = 0; joined < _numThreads; ++joined) {
            pthread_join(threads[joined], 0);
        }
        delete [] threads;
    }

    // Hands a task to the shared queue.
    void addTask(Task *nt) {
        workQueue.addTask(nt);
    }

    // Marks the shared queue finished; does not wait for the workers.
    void finish() {
        workQueue.finish();
    }

    // Reports whether the shared queue currently holds any entries.
    bool hasWork() {
        return workQueue.hasWork();
    }

private:
    pthread_t * threads;
    int _numThreads;
    WorkQueue workQueue;
};

// Serializes access to stdout, which is shared by all threads.
static pthread_mutex_t console_mutex = PTHREAD_MUTEX_INITIALIZER;

// Debugging hook. At present it only acquires and immediately releases the
// console mutex; n is not used and nothing is printed.
void showTask(int n) {
    (void)n;
    pthread_mutex_lock(&console_mutex);
    pthread_mutex_unlock(&console_mutex);
}

// Task that computes a Fibonacci number with the naive recursive algorithm.
// An iterative algorithm would be more efficient, but the recursive one takes
// longer and is a more interesting way to show parallelism than sleeping
// for X seconds.
class FibTask : public Task {
public:
    // n is the index of the Fibonacci number to compute.
    FibTask(int n) : Task(), _n(n) {}

    // Logs which thread is destroying the task.
    virtual ~FibTask() {
        // Debug prints. pthread_t is an opaque type and _n is a long long,
        // so neither may be passed to %d; the thread id is cast to
        // unsigned long for display purposes only.
        pthread_mutex_lock(&console_mutex);
        printf("tid(%lu) - fibd(%lld) being deleted\n",
               (unsigned long)pthread_self(), _n);
        pthread_mutex_unlock(&console_mutex);
    }

    // Computes the value and prints it under the console mutex.
    virtual void run() {
        // Note: it's important that the computation happens outside the
        // console mutex lock, otherwise the tasks would run one at a time.
        const long long val = innerFib(_n);

        // Show results
        pthread_mutex_lock(&console_mutex);
        printf("Fibd %lld = %lld\n", _n, val);
        pthread_mutex_unlock(&console_mutex);

        // The following won't work in parallel:
        //   pthread_mutex_lock(&console_mutex);
        //   printf("Fibd %lld = %lld\n", _n, innerFib(_n));
        //   pthread_mutex_unlock(&console_mutex);
    }

    // Logs which thread picked the task up.
    virtual void showTask() {
        // More debug printing
        pthread_mutex_lock(&console_mutex);
        printf("thread %lu computing fibonacci %lld\n",
               (unsigned long)pthread_self(), _n);
        pthread_mutex_unlock(&console_mutex);
    }

private:
    // Slow recursive computation; returns 1 for every n <= 1.
    // To make things interesting, and perhaps improve load balancing, these
    // inner computations could be added to the task queue.
    // Ideally set a lower limit on when that's done
    // (i.e. don't create a task for fib(2)) because thread overhead makes it
    // not worth it.
    long long innerFib(long long n) {
        if (n <= 1) { return 1; }
        return innerFib(n - 1) + innerFib(n - 2);
    }

    long long _n;
};

int main(int argc, char *argv[]) 
{
// Create a thread pool
ThreadPool *tp = new ThreadPool(10);

// Create work for it
/*for (int i=0;i<100; ++i) {
    int rv = rand() % 40 + 1;
    showTask(rv);
    tp->addTask(new FibTask(rv));
}*/
delete tp;

printf("\n\n\n\n\nDone with all work!\n");
}

Answer 1

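The design is more or less OK, but implementation-wise it contains several things that are a bit overcomplicated and may introduce instabilities. I guess your program deadlocks when you comment out the for loop because you should use pthread_cond_broadcast instead of pthread_cond_signal in your WorkQueue::finish() method. pthread_cond_signal is only guaranteed to wake one of the waiting workers, so the remaining ones stay blocked in pthread_cond_wait and the joins in the destructor never return.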

Note: I usually implement thread pool termination by placing NUM_THREADS NULL items into the work queue. I set a finished flag only so that my addTask() method can check it: after finish() I usually don't allow new tasks to be added, so addTask() returns false or, sometimes, asserts.

Another note: It's best to encapsulate threads in classes; this has several benefits and makes porting to multiple platforms easier.

There may be other bugs too as I haven't executed your program, just ran through your code.

EDIT: Here is a reworked version. I made some modifications to your code, but I don't guarantee that it works. Fingers crossed... :-)

#include <assert.h>
#include <malloc.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <queue>
#include <vector>

// Reusable thread base class wrapping a single pthread.
// Subclasses implement run(); start() launches it on a new thread and join()
// waits for it to return. A started thread must be joined exactly once
// before the object is destroyed (checked with assert).
class Thread
{
public:
    // Creates a thread object that has not been started yet. The handle is
    // value-initialized rather than assigned 0, so no assumption is made
    // about pthread_t being an arithmetic type.
    Thread() : state(EState_None), handle() {}

    // Asserts that the thread is not still running unjoined.
    virtual ~Thread()
    {
        assert(state != EState_Started);
    }

    // Launches run() on a new thread. May be called only once per object.
    // Aborts the process if the thread cannot be created.
    void start()
    {
        assert(state == EState_None);
        // in case of thread create error I usually FatalExit...
        if (pthread_create(&handle, NULL, threadProc, this))
            abort();
        state = EState_Started;
    }

    // Blocks until run() has returned.
    void join()
    {
        // A started thread must be joined exactly once!
        // This requirement could be eliminated with an alternative implementation but it isn't needed.
        assert(state == EState_Started);
        pthread_join(handle, NULL);
        state = EState_Joined;
    }

protected:
    // Body of the thread, supplied by the subclass.
    virtual void run() = 0;

private:
    // Not copyable: two objects sharing one handle would break the
    // join-exactly-once rule. Declared but intentionally never defined.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

    // Trampoline handed to pthread_create; param is the Thread object that
    // start() passed in as `this`.
    static void* threadProc(void* param)
    {
        Thread* thread = static_cast<Thread*>(param);
        thread->run();
        return NULL;
    }

private:
    enum EState
    {
        EState_None,     // start() has not been called yet
        EState_Started,  // started and not yet joined
        EState_Joined    // join() has completed
    };

    EState state;
    pthread_t handle;
};


// Abstract unit of work.
// Subclasses override run() to do the (possibly expensive) computation.
// showTask() is a debugging hook and can be removed if it is not used.
class Task {
public:
    Task() {}

    // Virtual so that a concrete task can be destroyed through a Task pointer.
    virtual ~Task() {}

    // Performs the task's work.
    virtual void run() = 0;

    // Debugging hook; what it reports is up to the subclass.
    virtual void showTask() = 0;
};


// Blocking FIFO of Task pointers: a std::queue guarded by a pthread mutex,
// with a condition variable that consumers wait on while the queue is empty.
class WorkQueue
{
public:
    // Sets up the mutex and the condition variable.
    WorkQueue() {
        pthread_mutex_init(&qmtx, 0);

        // wcond is signalled each time an entry is added
        pthread_cond_init(&wcond, 0);
    }

    // Releases the pthread primitives.
    ~WorkQueue() {
        pthread_mutex_destroy(&qmtx);
        pthread_cond_destroy(&wcond);
    }

    // Removes and returns the entry at the front of the queue, blocking for
    // as long as the queue is empty. The returned pointer is whatever was
    // passed to addTask(), which may be NULL.
    Task *nextTask() {
        pthread_mutex_lock(&qmtx);

        // The emptiness check is repeated after every wake-up
        while (tasks.size() == 0) {
            pthread_cond_wait(&wcond, &qmtx);
        }

        Task *const head = tasks.front();
        tasks.pop();

        pthread_mutex_unlock(&qmtx);
        return head;
    }

    // Appends an entry and wakes one waiting consumer.
    void addTask(Task *nt) {
        pthread_mutex_lock(&qmtx);
        tasks.push(nt);
        pthread_cond_signal(&wcond);
        pthread_mutex_unlock(&qmtx);
    }

private:
    std::queue<Task*> tasks;
    pthread_mutex_t qmtx;
    pthread_cond_t wcond;
};

// Thanks to the reusable thread class implementing threads is
// simple and free of pthread api usage.
class PoolWorkerThread : public Thread
{
public:
    PoolWorkerThread(WorkQueue& _work_queue) : work_queue(_work_queue) {}
protected:
    virtual void run()
    {
        while (Task* task = work_queue.nextTask())
            task->run();
    }
private:
    WorkQueue& work_queue;
};

class ThreadPool {
public:
    // Allocate a thread pool and set them to work trying to get tasks
    ThreadPool(int n) {
        printf("Creating a thread pool with %d threads\n", n);
        for (int i=0; i<n; ++i)
        {
            threads.push_back(new PoolWorkerThread(workQueue));
            threads.back()->start();
        }
    }

    // Wait for the threads to finish, then delete them
    ~ThreadPool() {
        finish();
    }

    // Add a task
    void addTask(Task *nt) {
        workQueue.addTask(nt);
    }

    // Asking the threads to finish, waiting for the task
    // queue to be consumed and then returning.
    void finish() {
        for (size_t i=0,e=threads.size(); i<e; ++i)
            workQueue.addTask(NULL);
        for (size_t i=0,e=threads.size(); i<e; ++i)
        {
            threads[i]->join();
            delete threads[i];
        }
        threads.clear();
    }

private:
    std::vector<PoolWorkerThread*> threads;
    WorkQueue workQueue;
};

// Serializes access to stdout, which is shared by all threads.
static pthread_mutex_t console_mutex = PTHREAD_MUTEX_INITIALIZER;

// Debugging hook. At present it only acquires and immediately releases the
// console mutex; n is not used and nothing is printed.
void showTask(int n) {
    (void)n;
    pthread_mutex_lock(&console_mutex);
    pthread_mutex_unlock(&console_mutex);
}

// Task that computes a Fibonacci number with the naive recursive algorithm.
// An iterative algorithm would be more efficient, but the recursive one takes
// longer and is a more interesting way to show parallelism than sleeping
// for X seconds.
class FibTask : public Task {
public:
    // n is the index of the Fibonacci number to compute.
    FibTask(int n) : Task(), _n(n) {}

    // Logs which thread is destroying the task.
    ~FibTask() {
        const int tid = (int)pthread_self();
        const int index = (int)_n;
        pthread_mutex_lock(&console_mutex);
        printf("tid(%d) - fibd(%d) being deleted\n", tid, index);
        pthread_mutex_unlock(&console_mutex);
    }

    // Computes the value, prints it under the console mutex and then
    // destroys this task object.
    virtual void run() {
        // The computation is deliberately done before taking the console
        // mutex; computing inside the printf call while holding the lock
        // would stop the tasks from running in parallel.
        const long long result = innerFib(_n);

        pthread_mutex_lock(&console_mutex);
        printf("Fibd %d = %lld\n", (int)_n, result);
        pthread_mutex_unlock(&console_mutex);

        // This thread pool implementation doesn't delete the tasks, so the
        // task cleans itself up. Nothing may touch the object afterwards.
        delete this;
    }

    // Logs which thread picked the task up.
    virtual void showTask() {
        const int tid = (int)pthread_self();
        const int index = (int)_n;
        pthread_mutex_lock(&console_mutex);
        printf("thread %d computing fibonacci %d\n", tid, index);
        pthread_mutex_unlock(&console_mutex);
    }

private:
    // Slow recursive computation; yields 1 for every n <= 1.
    // To make things interesting, and perhaps improve load balancing, these
    // inner computations could be added to the task queue. Ideally there
    // would be a lower limit on when that's done (i.e. no task for fib(2)),
    // because thread overhead makes it not worth it.
    long long innerFib(long long n) {
        return (n <= 1) ? 1 : innerFib(n - 1) + innerFib(n - 2);
    }

    long long _n;
};

int main(int argc, char *argv[]) 
{
    // Create a thread pool
    ThreadPool *tp = new ThreadPool(10);

    // Create work for it
    for (int i=0;i<100; ++i) {
        int rv = rand() % 40 + 1;
        showTask(rv);
        tp->addTask(new FibTask(rv));
    }
    delete tp;

    printf("\n\n\n\n\nDone with all work!\n");
}

Answer 2

I think you are having a race condition there...

When you remove the for loop, the pool is destructed as soon as it gets created so there is no time for the threads to start waiting on the queue. Try putting a sleep there and you'll see.

I implemented a thread pool library which is widely used across all our services, so here is some advice:

You are using C++, so there's no need to use pthreads directly; just use Boost, or std::thread if available
Don't signal; push empty tasks instead (pushing a task requires signalling, of course)
Use boost::function or std::function instead of a base class
Cope with spurious wake-ups (your code doesn't seem to handle them)
pthread_cond_signal wakes up only one thread; you must use pthread_cond_broadcast if you want to notify them all. That said, I'd again recommend sticking to Boost's condition variables (@pasztorpisti got it right here, he's got my upvote)

Thread pool implementation using pthreads

Question

2 answers

solution1
5 2014-03-09 17:32:27

solution2
0 2014-03-09 17:50:16

Thread pool implementation using pthreads

Question

2 answers

solution1 5 2014-03-09 17:32:27

solution2 0 2014-03-09 17:50:16

solution1
5 2014-03-09 17:32:27

solution2
0 2014-03-09 17:50:16