简体   繁体   中英

How to get the raw data from pdf.js

I am building a page that uses PDF.js to load and render a PDF, as shown in the following code.

// Loads a PDF with PDF.js and renders its pages one at a time. renderPage is
// defined elsewhere; it is expected to invoke the callback passed as its last
// argument once the requested page has finished rendering.
var url = '/path-to-pdf.js';
PDFJS.workerSrc = "./js/external/pdf.worker.js";

PDFJS.getDocument(url).then(function getPdfHelloWorld(pdf) {

    var pageNumber = 1;
    renderPage($(".center-info")[0], pdf, pageNumber, function pageRenderingComplete() {
        // pageNumber is the page that has just finished. Stop as soon as that
        // was the last page; a strict ">" here would request page
        // numPages + 1, which does not exist.
        if (pageNumber >= pdf.numPages) {
            return; // All pages rendered
        }
        // Continue rendering of the next page
        // NOTE(review): "display-div" is a tag selector, unlike the class
        // selector ".center-info" used for the first page; confirm whether
        // ".display-div" or "#display-div" was intended.
        renderPage($("display-div")[0], pdf, ++pageNumber, pageRenderingComplete);
    });

});

I would like to offer a client-side download, which means I need to access the raw PDF data directly. Is it possible to do that here?

I just found the answer: the raw data can be accessed with the getData() method.

// Fetches the raw bytes of the loaded PDF and converts them to a binary string.
PDFJS.getDocument(url).then(function getPdfHelloWorld(pdf) {

    pdf.getData().then(function(bytes) {
        // getData() resolves with a Uint8Array. Passing the whole array to
        // String.fromCharCode.apply() at once spreads every byte into a
        // separate argument and throws a RangeError for large files, so the
        // conversion is done in fixed-size chunks instead.
        var CHUNK_SIZE = 0x8000;
        var parts = [];
        for (var offset = 0; offset < bytes.length; offset += CHUNK_SIZE) {
            parts.push(String.fromCharCode.apply(null, bytes.subarray(offset, offset + CHUNK_SIZE)));
        }
        var pdfraw = parts.join("");

        // Operate on your raw PDF here...
    });
});

Cheers

/**
 * Collects the text items of every page of a PDF, in page order.
 *
 * @param {*} input - Passed unchanged to `pdfJS.getDocument`.
 * @returns {Promise<Array>} The `items` of each page's text content,
 *     concatenated from page 1 through `numPages`.
 */
async function extract(input) {
    // Current pdf.js releases return a loading task whose `promise` property
    // resolves to the document; awaiting the task itself would yield an object
    // without `numPages` and silently produce an empty result. Older releases
    // returned something directly awaitable, so fall back to that.
    const loadingTask = pdfJS.getDocument(input);
    const pdf = await (loadingTask.promise ?? loadingTask);

    const elements = [];

    // Pages are 1-indexed and are processed sequentially to keep the output
    // in document order.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent({
            normalizeWhitespace: true,
            disableCombineTextItems: false,
        });

        for (const item of textContent.items) {
            elements.push(item);
        }
    }

    return elements;
}

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM