
Output between single-threaded and multi-threaded versions of same application differs [Python]

I have written two versions of a program that parse a log file and return the number of strings matching a given regex. The single-threaded version returns the correct output:

Number of Orders ('ORDER'): 1108
Number of Replacements ('REPLACE'): 742
Number of Orders and Replacements: 1850
Time to process: 5.018553

The multithreaded version, however, returns erroneous values:

Number of Orders ('ORDER'): 1579
Number of Replacements ('REPLACE'): 1108
Number of Orders and Replacements: 2687
Time to process: 2.783091

The time can vary (the multithreaded version should be faster), but I can't work out why the order and replacement counts differ between the two versions.

Here is the multithreaded version:

import re
import time
import sys
import threading
import Queue

class PythonLogParser:
    queue = Queue.Queue()

    class FileParseThread(threading.Thread):

        def __init__(self, parsefcn, f, startind, endind, olist):
            threading.Thread.__init__(self)
            self.parsefcn = parsefcn
            self.startind = startind
            self.endind = endind
            self.olist = olist
            self.f = f

        def run(self):
            self.parsefcn(self.f, self.startind, self.endind, self.olist)

    def __init__(self, filename):
        assert(len(filename) != 0)
        self.filename = filename
        self.start = 0
        self.end = 0

    def open_file(self):
        f = None
        try:
            f = open(self.filename)
        except IOError as e:
            print 'Unable to open file:', e.message
        return f

    def count_orders_from(self, f, starting, ending, offset_list):
        f.seek(offset_list[starting])
        order_pattern = re.compile(r'.*(IN:)(\s)*(ORDER).*(ord_type)*')
        replace_pattern = re.compile(r'.*(IN:)(\s)*(REPLACE).*(ord_type)*')
        order_count=replace_count = 0
        for line in f:
            if order_pattern.match(line) != None:
                order_count+=1 # = order_count + 1
            if replace_pattern.match(line) != None:
                replace_count+=1 # = replace_count + 1
        #return (order_count, replace_count, order_count+replace_count)
        self.queue.put((order_count, replace_count, order_count+replace_count))

    def get_file_data(self):
        offset_list = []
        offset = 0
        num_lines = 0
        f = 0
        try:
            f = open(self.filename)
            for line in f:
                num_lines += 1
                offset_list.append(offset)
                offset += len(line)
            f.close()
        finally:
            f.close()
        return (num_lines, offset_list)

    def count_orders(self):
        self.start = time.clock()
        num_lines, offset_list = self.get_file_data()
        start_t1 = 0
        end_t1 = num_lines/2
        start_t2 = end_t1 + 1
        f = open(self.filename)
        t1 = self.FileParseThread(self.count_orders_from, f, start_t1, end_t1, offset_list)
        self.count_orders_from(f, start_t2, num_lines, offset_list)
        t1.start()
        self.end = time.clock()
        tup1 = self.queue.get()
        tup2 = self.queue.get()
        order_count1, replace_count1, sum1 = tup1
        order_count2, replace_count2, sum2 = tup2
        print 'Number of Orders (\'ORDER\'): {0}\n'\
        'Number of Replacements (\'REPLACE\'): {1}\n'\
        'Number of Orders and Replacements: {2}\n'\
        'Time to process: {3}\n'.format(order_count1+order_count2, \
                                        replace_count1+replace_count2, \
                                        sum1+sum2, \
                                        self.end - self.start)
        f.close()

def test2():
    p = PythonLogParser('../../20150708.aggregate.log')
    p.count_orders()

def main():
    test2()

main()

The idea is that, since the file is large, each thread will read half of it: t1 reads the first half and the main thread reads the second. The main thread then adds together the results from both halves and displays them.
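
The aggregation pattern I am going for looks roughly like this minimal sketch (the worker function and data are made up for illustration, not taken from my program): each call counts its own share and reports a partial result on a Queue, and the main thread sums the partials.

import threading
import Queue

def worker(numbers, out_queue):
    # All state here is local to the call; only the finished sum is shared.
    out_queue.put(sum(numbers))

q = Queue.Queue()
data = range(100)
t = threading.Thread(target=worker, args=(data[:50], q))
t.start()
worker(data[50:], q)  # the main thread handles the second half itself
t.join()
print q.get() + q.get()  # 4950: the main thread combines the partial sums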

My suspicion is that somehow the order_count and replace_count in count_orders_from are being modified between threads rather than starting at 0 for each thread, but I'm not sure, since I don't see why separate calls to a method from two separate threads would modify the same variables.
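
As far as I understand, locals should be private to each call. A tiny sketch (made-up names, not my program) that demonstrates this:

import threading

results = []

def count_up(n):
    count = 0  # a fresh local binding is created on every call
    for _ in range(n):
        count += 1
    results.append(count)  # list.append is atomic in CPython

t = threading.Thread(target=count_up, args=(1000,))
t.start()
count_up(5)  # the main thread's own 'count' also starts at 0
t.join()
print sorted(results)  # [5, 1000]: the two calls never share their locals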

The error was occurring because, even though in theory the threads were parsing individual halves, 'for line in f' keeps reading until end-of-file regardless of the intended endpoint. In practice one thread parsed from the midpoint to the end while the other parsed the full file, so lines in the second half were double counted. The fix was to add a linecount variable to count_orders_from to check whether the reader has reached the line it is supposed to stop at:

def count_orders_from(self, f, starting, ending, offset_list):
    f.seek(offset_list[starting])
    order_pattern = re.compile(r'.*(IN:)(\s)*(ORDER).*(ord_type)*')
    replace_pattern = re.compile(r'.*(IN:)(\s)*(REPLACE).*(ord_type)*')
    order_count = replace_count = linecount = 0
    for line in f:
        if order_pattern.match(line) != None:
            order_count += 1
        if replace_pattern.match(line) != None:
            replace_count += 1
        if linecount == ending:
            break  # stop at this call's endpoint instead of running to EOF
        linecount += 1
    self.queue.put((order_count, replace_count, order_count+replace_count))
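
As a design note, the shared file object can be avoided entirely by giving each worker its own handle and an explicit line budget. The following is only a sketch under that assumption (the function name and parameters are illustrative, not part of the original program; out_queue is expected to be a Queue.Queue):

import re

def count_matches(filename, start_offset, max_lines, out_queue):
    # Each worker opens a private file object, so no other thread can
    # move this handle's position with seek() or iteration.
    order_pattern = re.compile(r'.*(IN:)(\s)*(ORDER).*(ord_type)*')
    replace_pattern = re.compile(r'.*(IN:)(\s)*(REPLACE).*(ord_type)*')
    orders = replaces = 0
    f = open(filename)
    try:
        f.seek(start_offset)
        for i, line in enumerate(f):
            if i >= max_lines:  # stop once this worker's share is read
                break
            if order_pattern.match(line):
                orders += 1
            if replace_pattern.match(line):
                replaces += 1
    finally:
        f.close()
    out_queue.put((orders, replaces))

With per-worker handles, the stop condition counts only the lines this call has read, so it no longer depends on where a shared file position happens to be when the thread starts.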
