简体   繁体   中英

Workers in javascript not so fast

I am trying out workers in JS, and I tried to make a simple sort using the same JS sort function. The comparison I am making just uses an async function that sorts 60000 random numbers. The first version sorts the random numbers the way we traditionally do it.

/**
 * Sorts numbers in ascending order on the calling thread.
 *
 * The sort itself runs synchronously; the function is `async` only so that it
 * returns a promise.
 *
 * @param {number[]} arr - Numbers to sort. The input is left unmodified.
 * @returns {Promise<number[]>} Resolves with a new array in ascending order.
 */
async function normalSort(arr) {
    // `sort` works in place, so take a real copy; a plain assignment would
    // only alias the caller's array.
    const copy = arr.slice();
    // `a - b` yields 0 for equal values, giving `sort` a consistent comparator.
    copy.sort((a, b) => a - b);
    return copy;
}

The other is a normal function, which will be called by a workersHandler function:

const { Worker, parentPort, workerData } = require('worker_threads');

/**
 * Sorts the given numbers in ascending order and posts them to the parent
 * thread.
 *
 * @param {number[]} data - Numbers to sort; sorted in place.
 */
function sort(data) {
    // Sort in place: no separate copy is made of `data`.
    // `a - b` returns 0 for equal values, so the comparator is consistent.
    data.sort((a, b) => a - b);
    parentPort.postMessage(data);
    // NOTE(review): per the Node.js docs, inside a worker this stops the
    // worker thread rather than the whole process.
    process.exit();
}


sort(workerData);

the workers handler function

const os = require('os');
const path = require('path');
const { Worker } = require('worker_threads');

/**
 * Splits `arr` into roughly one chunk per CPU core and sorts every chunk in
 * its own worker thread.
 *
 * The sorted chunks are NOT merged: the result is an array of chunks, in
 * input order, each one being whatever its worker posted back.
 *
 * @param {number[]} arr - Values to sort. Not modified; chunks are sliced copies.
 * @returns {Promise<Array>} Resolves with one posted result per chunk; rejects
 *     if a worker reports an error or stops with a non-zero exit code.
 */
async function workersHandler(arr) {
    const cpusAmount = os.cpus().length;
    const chSize = Math.ceil(arr.length / cpusAmount);
    const promises = [];
    for (let i = 0; i < arr.length; i += chSize) {
        const currentChunk = arr.slice(i, i + chSize);
        promises.push(new Promise((res, rej) => {
            //@ts-ignore
            const worker = new Worker(path.join(__dirname, '..', '/utils/sort.js'), { workerData: currentChunk });

            worker.once('message', res);
            worker.once('error', rej);
            // Without this, a worker that dies before posting a result would
            // leave the promise pending forever. If the promise has already
            // been settled, the extra reject is a no-op.
            worker.once('exit', (code) => {
                if (code !== 0) {
                    rej(new Error(`Sort worker stopped with exit code ${code}`));
                }
            });
        }));
    }
    return Promise.all(promises);
}

and the main function which will call the others functions

/**
 * Times the worker-based sort and the single-threaded sort on the same
 * 60000 random integers and logs how long each one took.
 *
 * @returns {Promise<void>} Resolves once both timings have been logged.
 */
async function main() {
    const arr = Array.from({ length: 60000 }, () => Math.round(Math.random() * 100));

    // Each strategy gets its own stopwatch and runs to completion before the
    // next one starts. Starting both at once made the worker timing include
    // the normal sort: that sort runs synchronously on the main thread, and
    // worker messages cannot be received until it has finished.
    const workersStart = Date.now();
    await workersHandler(arr);
    console.log('workers sort', `${Date.now() - workersStart} ms`);

    const normalStart = Date.now();
    await normalSort(arr);
    console.log('normal sort', `${Date.now() - normalStart} ms`);
}
main().catch((err) => console.error(err));

Surprisingly, the normal sort function is way faster, even though it works in a single thread. I am getting 101 ms for the workers function and 53 ms for the normal sort function. Could someone explain these weird results? Are workers not so fast, or is my implementation wrong?

Basically, using a single worker thread and waiting for it to do the work will always be slower than doing the work in the local thread, because:

  • Creating threads takes time.
  • Sending data between threads takes time.

Where you might get gains is if you have isolated pieces of work that can be handled in parallel, and multiple CPU cores to work with. In that situation, you can send different pieces of work to multiple workers (up to as many CPU cores as are available), provided the work isn't constrained by some other single resource they'd all be competing for.

Below I've posted a program that sorts 12 arrays locally and via workers with repeated races. (When sorting in workers, it transfers the array data to the worker and then back rather than copying it.) It starts the workers in advance and reuses them, but it includes the time that startup took when determining the average time the workers took to do their work, so we're including all overhead.

On my workstation, with four CPU cores and letting it have a worker for each core, workers easily win:

# of workers:     4
Local average:    8790.010573029518ms
Workers' average: 3550.658817946911ms
Workers win, taking 40.39425% of the time local did

If I limit it to one worker, though, the worker is pure overhead and the local thread wins:

# of workers:     1
Local average:    8907.022233068943ms
Workers' average: 8953.339844942093ms
Local wins, taking 99.48268% of the time workers did

Even just two workers wins, because they can work in parallel on this multi-core machine:

# of workers:     2
Local average:    8782.853852927685ms
Workers' average: 4754.60275799036ms
Workers win, taking 54.13505% of the time local did

On a single core machine (if you can find one anymore), those two workers would be pure overhead again, and the local thread would win.

Here's main.js :

const os = require('os');
const { Worker } = require('worker_threads');
const { performance } = require('perf_hooks');

// Upper bound used when generating random 32-bit unsigned values.
const MAX_UINT32 = 0xFFFFFFFF;
// Requested number of elements per array.
const ARRAY_SIZE = 100000;
// Number of arrays sorted in each race.
const ARRAY_COUNT = 12;

// Optional CLI overrides: `node main.js [workerCount] [raceCount]`.
const [, , workerCountArg, raceCountArg] = process.argv;
// A missing, non-numeric or zero argument falls back to the default.
const workerCount = Number(workerCountArg) || os.cpus().length;
const raceCount = Number(raceCountArg) || 5;

/**
 * A fixed pool of workers handed out one at a time.
 *
 * `get()` resolves with an idle worker, waiting (first come, first served)
 * when none is idle; `release()` returns a worker to the pool.
 */
class WorkerQueue {
    // Every worker owned by this queue.
    #workers;
    // Workers that are currently idle, oldest first.
    #available;
    // `resolve` callbacks of `get()` callers still waiting, oldest first.
    #pending;
    // Pairs idle workers with waiting callers for as long as both exist.
    #checkPending = () => { // private methods still aren't unflagged in v13, so...
        while (this.#available.length && this.#pending.length) {
            const resolve = this.#pending.shift();
            const worker = this.#available.shift();
            resolve(worker);
        }
    };

    /**
     * @param {...object} workers - The workers to manage; all start out idle.
     */
    constructor(...workers) {
        this.#workers = new Set(workers);
        this.#available = [...this.#workers];
        this.#pending = [];
    }

    /**
     * Requests a worker.
     *
     * @returns {Promise<object>} Resolves with a worker once one is idle.
     */
    get() {
        return new Promise(resolve => {
            this.#pending.push(resolve);
            this.#checkPending();
        });
    }

    /**
     * Returns a worker to the pool and hands it to the next waiting caller,
     * if there is one.
     *
     * @param {object} worker - A worker previously obtained from `get()`.
     * @throws {Error} If the worker does not belong to this queue, or is
     *     already idle (released twice).
     */
    release(worker) {
        if (!this.#workers.has(worker)) {
            throw new Error("Unknown worker");
        }
        // Queuing the same worker twice would let two callers use it at once.
        if (this.#available.includes(worker)) {
            throw new Error("Worker already released");
        }
        this.#available.push(worker);
        this.#checkPending();
    }

    /**
     * Terminates every worker and empties the queue. Callers still waiting in
     * `get()` are dropped without being resolved.
     */
    terminate() {
        for (const worker of this.#workers) {
            worker.terminate();
        }
        this.#workers = new Set();
        this.#available = [];
        this.#pending = [];
    }
}

// Create the worker pool up front and record how long that took.
// `workers` and `workerCreationTime` must be initialised before `main()` is
// called, because `main` reads both.
const {workers, workerCreationTime} = createWorkers();

// Start the benchmark; `main` catches and logs its own errors.
main();

/**
 * Spawns `workerCount` workers running `./worker.js`, wraps them in a
 * `WorkerQueue`, and measures how long that took.
 *
 * @returns {{workers: WorkerQueue, workerCreationTime: number}} The queue and
 *     the elapsed creation time in milliseconds.
 */
function createWorkers() {
    const spawnWorker = () => new Worker("./worker.js");

    const startedAt = performance.now();
    const pool = Array.from({length: workerCount}, spawnWorker);
    const queue = new WorkerQueue(...pool);
    const elapsed = performance.now() - startedAt;

    return {workers: queue, workerCreationTime: elapsed};
}

/**
 * Runs `raceCount` races between local sorting and worker sorting, then logs
 * the average time of each side and which one won.
 *
 * Errors are logged rather than rethrown. The workers are terminated whether
 * or not the races succeed.
 *
 * @returns {Promise<void>}
 */
async function main() {
    try {
        console.log(`Workers: ${workerCount} (in ${workerCreationTime}ms), races: ${raceCount}`);
        let localTotal = 0;
        let workersTotal = 0;
        for (let n = 1; n <= raceCount; ++n) {
            console.log(`Race #${n}:`);
            const {localTime, workersTime} = await sortRace();
            localTotal += localTime;
            workersTotal += workersTime;
        }
        // Turn the totals into real per-race averages.
        const localAverage = localTotal / raceCount;
        // Include the time it took to create the workers in the workers' average, as
        // though we'd created them for each race. (We didn't because doing so would
        // have given the local thread an advantage: after the first race, it's warmed
        // up, but a new worker would be cold. So we let the workers be warm but add
        // the full creation time into each race.)
        const workersAverage = workersTotal / raceCount + workerCreationTime;
        console.log("----");
        console.log(`# of workers:     ${workerCount}`);
        console.log(`Local average:    ${localAverage}ms`);
        console.log(`Workers' average: ${workersAverage}ms`);
        if (localAverage > workersAverage) {
            showWinner("Workers win", "local", workersAverage, localAverage);
        } else {
            showWinner("Local wins", "workers", localAverage, workersAverage);
        }
    } catch (e) {
        console.error(e.message, e.stack);
    } finally {
        // Terminate on the failure path too, so a failed race does not leave
        // the workers running.
        workers.terminate();
    }
}

/**
 * Logs the winner line, expressing the winner's time as a percentage of the
 * loser's time with five decimal places.
 *
 * @param {string} msg - Leading text, e.g. "Workers win".
 * @param {string} loser - Label for the losing side.
 * @param {number} winnerAverage - The winner's time.
 * @param {number} loserAverage - The loser's time.
 */
function showWinner(msg, loser, winnerAverage, loserAverage) {
    const share = (winnerAverage * 100) / loserAverage;
    const formattedShare = share.toFixed(5);
    const line = `${msg}, taking ${formattedShare}% of the time ${loser} did`;
    console.log(line);
}

/**
 * Runs one race: sorts `ARRAY_COUNT` random arrays locally, then sorts copies
 * of the same data via the workers, timing and validating each run.
 *
 * @returns {Promise<{localTime: number, workersTime: number}>} Elapsed
 *     milliseconds for the local run and for the worker run.
 */
async function sortRace() {
    // Random input for the local run.
    const localArrays = [];
    for (let i = 0; i < ARRAY_COUNT; ++i) {
        localArrays.push(createRandomArray(ARRAY_SIZE));
    }
    // Independent copies, taken before anything is sorted, so the workers
    // receive exactly the same values.
    const workerArrays = localArrays.map((source) => Uint32Array.from(source));

    // Local run.
    const localStart = performance.now();
    const localResults = await Promise.all(localArrays.map((array) => sortLocal(array)));
    const localTime = performance.now() - localStart;
    checkResults(localResults);
    console.log(`Local time:    ${localTime}ms`);

    // Worker run.
    const workerStart = performance.now();
    const workersResults = await Promise.all(workerArrays.map((array) => sortViaWorker(array)));
    const workersTime = performance.now() - workerStart;
    checkResults(workersResults);
    console.log(`Workers' time: ${workersTime}ms`);

    return {localTime, workersTime};
}

/**
 * Sorts `array` in place, in ascending numeric order, on the calling thread.
 *
 * @param {Uint32Array|number[]} array - Values to sort; mutated.
 * @returns {Promise<Uint32Array|number[]>} Resolves with the same array.
 */
async function sortLocal(array) {
    // Wait one tick first so the sort starts asynchronously, like `sortViaWorker` does.
    await Promise.resolve();
    const ascending = (left, right) => left - right;
    return array.sort(ascending);
}

/**
 * Sorts `array` on a pooled worker, transferring the array's buffer to the
 * worker instead of copying it.
 *
 * @param {Uint32Array} array - Values to sort. Its buffer is listed for
 *     transfer, so the caller must not use `array` afterwards.
 * @returns {Promise<Uint32Array>} Resolves with the `array` property of the
 *     worker's reply; rejects if the worker emits `error`.
 */
async function sortViaWorker(array) {
    const worker = await workers.get();
    return new Promise((resolve, reject) => {
        // Workers are reused, so each handler removes the other one; otherwise
        // stale listeners would pile up on the worker across calls.
        const onMessage = (result) => {
            worker.off("error", onError);
            workers.release(worker);
            resolve(result.array);
        };
        // Reject instead of leaving the `error` event unhandled. The worker is
        // deliberately not released, so it is never handed out again.
        const onError = (error) => {
            worker.off("message", onMessage);
            reject(error);
        };
        worker.once("message", onMessage);
        worker.once("error", onError);
        worker.postMessage({array}, [array.buffer]);
    });
}

/**
 * Verifies that every array is in non-decreasing order.
 *
 * @param {Iterable<ArrayLike<number>>} arrays - The arrays to check.
 * @throws {Error} At the first entry that is smaller than the entry before
 *     it, naming the index and both values.
 */
function checkResults(arrays) {
    for (const candidate of arrays) {
        for (let i = 1; i < candidate.length; ++i) {
            const previous = candidate[i - 1];
            const current = candidate[i];
            if (previous > current) {
                throw new Error(
                    `Error, array entry ${i} has value ${current} ` +
                    `which is > previous value ${previous}`
                );
            }
        }
    }
}

/**
 * Creates a `Uint32Array` with `length` elements and fills it with random
 * values.
 *
 * @param {number} length - Number of elements.
 * @returns {Uint32Array} The array as returned by `randomFillArray`.
 */
function createRandomArray(length) {
    // The typed-array constructor takes an element count, not a byte count;
    // multiplying by BYTES_PER_ELEMENT produced four times too many elements.
    const array = new Uint32Array(length);
    return randomFillArray(array);
}

/**
 * Overwrites every element of `array` with `Math.random() * MAX_UINT32`.
 *
 * @param {Uint32Array|number[]} array - The array to fill; mutated.
 * @returns {Uint32Array|number[]} The same array.
 */
function randomFillArray(array) {
    const count = array.length;
    let index = 0;
    while (index < count) {
        array[index] = Math.random() * MAX_UINT32;
        index += 1;
    }
    return array;
}

and worker.js :

const { parentPort } = require("worker_threads");

/**
 * Sorts the array from an incoming message in ascending numeric order and
 * posts it back, listing the array's buffer for transfer.
 *
 * @param {{array: Uint32Array}} message - The job sent by the parent thread.
 */
function handleSortRequest(message) {
    const ascending = (left, right) => left - right;
    const sorted = message.array.sort(ascending);
    parentPort.postMessage({array: sorted}, [sorted.buffer]);
}

parentPort.on("message", handleSortRequest);

https://nodejs.org/api/worker_threads.html#worker_threads_port_postmessage_value_transferlist and https://developer.mozilla.org/en-US/docs/Web/API/Worker/postMessage :

postMessage(value[, transferList])
node : transferList may be a list of ArrayBuffer and MessagePort objects. After transferring, they will not be usable on the sending side of the channel anymore (even if they are not contained in value ). MDN : An optional array of Transferable objects to transfer ownership of. If the ownership of an object is transferred, it becomes unusable (neutered) in the context it was sent from and becomes available only to the worker it was sent to. Transferable objects are instances of classes like ArrayBuffer , MessagePort or ImageBitmap objects that can be transferred.

Effect of types:

 // Browser snippet: sorts four chunks sequentially on the main thread
 // ("Seq"), then sorts copies of the same chunks in four Web Workers ("Par"),
 // and logs how long each took. The element type is chosen at the prompt.
 let typ = prompt("Type: 0/1/2/3 (Array/Float64Array/Float32Array/Uint32Array)");
 let len = Number.parseInt(prompt("Length"), 10);

 // Anything other than "1", "2" or "3" falls back to a plain JS Array.
 let basearray;
 switch (typ) {
     case "1": basearray = new Float64Array(len); break;
     case "2": basearray = new Float32Array(len); break;
     case "3": basearray = new Uint32Array(len); break;
     default: basearray = new Array(len); break;
 }
 for (let i = 0; i < basearray.length; i++) {
     basearray[i] = Math.random() * 0x1000000;
 }

 // `chunks` is sorted locally; `chunksw` holds copies for the workers.
 let cpus = 4;
 let chunksize = basearray.length / cpus;
 let chunks = [];
 let chunksw = [];
 for (let i = 0; i < cpus; i++) {
     chunks[i] = basearray.slice(i * chunksize, (i + 1) * chunksize);
     chunksw[i] = chunks[i].slice();
 }

 // Sequential measurement.
 let start = Date.now();
 for (let i = 0; i < cpus; i++) {
     chunks[i].sort((a, b) => a - b);
 }
 console.log("Seq:", Date.now() - start);

 // Worker source: sort whatever arrives and post it back.
 let code = "onmessage=event=>postMessage(event.data.sort((a,b)=>a-b));";
 let ws = [];
 let cnt = 0;
 for (let i = 0; i < cpus; i++) {
     ws[i] = new Worker("data:text/plain," + encodeURIComponent(code));
     let j = i;
     ws[i].onmessage = event => {
         chunksw[j] = event.data;
         // The parallel run is over when the last worker has answered.
         if (++cnt === cpus) {
             console.log("Par:", Date.now() - start);
             if (len <= 20) {
                 for (let i = 0; i < cpus; i++) {
                     console.log(chunks[i], chunksw[i]);
                 }
             }
         }
     };
 }

 // Parallel measurement.
 start = Date.now();
 for (let i = 0; i < cpus; i++) {
     ws[i].postMessage(chunksw[i]);
 }

Specify a length divisible by 4. If the length is 20 or less, the resulting sorted chunks are logged too, for verification purposes. JS Arrays are reliably slower for me when passed around (compared to the thread-less run), regardless of whether they contain 20 or 6000000 elements (a 6-million-element JS array runs for 8 seconds for me on an older laptop, so it may still be safer to start with something smaller). The other types are faster when threaded, Uint32Array being the fastest.
Actually anything which is not 1/2/3 is going to result in a JS Array (the slowest one), including the empty string.

Effect of transfer is not that spectacular, but already appears from the beginning (with 4 elements it is 59-69 ms vs 20-22 ms on my PC):

 // Browser snippet: sorts four typed-array chunks in Web Workers twice, first
 // posting the arrays normally ("Non-transfer"), then transferring their
 // buffers ("Transfer"), and logs how long each run took.
 let typ = prompt("Type: 0/1/2 (Float64Array/Float32Array/Uint32Array)");
 let len = Number.parseInt(prompt("Length"), 10);

 // Anything other than "1" or "2" falls back to Float64Array.
 const ArrayType = typ === "1" ? Float32Array
     : typ === "2" ? Uint32Array
     : Float64Array;

 let basearray = new ArrayType(len);
 for (let i = 0; i < basearray.length; i++) {
     basearray[i] = Math.random() * 0x1000000;
 }

 // `chunksw` feeds the non-transfer run; `chunkswt` holds copies for the
 // transfer run.
 let cpus = 4;
 let chunksize = basearray.length / cpus;
 let chunksw = [];
 let chunkswt = [];
 for (let i = 0; i < cpus; i++) {
     chunksw[i] = basearray.slice(i * chunksize, (i + 1) * chunksize);
     chunkswt[i] = chunksw[i].slice();
 }

 let start;
 let cnt = 0;

 // Non-transfer workers: sort whatever arrives and post it back.
 let code = "onmessage=event=>postMessage(event.data.sort((a,b)=>a-b));";
 let ws = [];
 for (let i = 0; i < cpus; i++) {
     ws[i] = new Worker("data:text/plain," + encodeURIComponent(code));
     let j = i;
     ws[i].onmessage = event => {
         chunksw[j] = event.data;
         if (++cnt === cpus) {
             console.log("Non-transfer:", Date.now() - start);
             // launch transfer measurement
             cnt = 0;
             start = Date.now();
             for (let i = 0; i < cpus; i++) {
                 wst[i].postMessage(chunkswt[i].buffer, [chunkswt[i].buffer]);
             }
         }
     };
 }

 // Transfer workers: only the buffer travels, so the worker wraps it in a
 // typed array of the chosen type, sorts in place, and transfers it back.
 let codet = "onmessage=event=>{" +
     "let arr=new " + ArrayType.name + "(event.data);" +
     "arr.sort((a,b)=>a-b);" +
     "postMessage(event.data,[event.data]);};";
 let wst = [];
 for (let i = 0; i < cpus; i++) {
     wst[i] = new Worker("data:text/plain," + encodeURIComponent(codet));
     let j = i;
     wst[i].onmessage = event => {
         chunkswt[j] = new ArrayType(event.data);
         if (++cnt === cpus) {
             console.log("Transfer:", Date.now() - start);
             if (len <= 20) {
                 for (let i = 0; i < cpus; i++) {
                     console.log(chunksw[i], chunkswt[i]);
                 }
             }
         }
     };
 }

 // launch non-transfer measurement
 start = Date.now();
 for (let i = 0; i < cpus; i++) {
     ws[i].postMessage(chunksw[i]);
 }

This code is a bit messy because it is the buffer which can be transferred, not the typed arrays themselves, and also, while the second measurement is initialized as a direct copy-paste (which already isn't that pretty), it is then launched from inside the completion function of the first one.

(I do not wish to provide exact measurement results because my PC is doing some other things too. Just run the snippets a couple times with varied or even repeated parameters)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM