简体   繁体   中英

Extract images from PDF file with jspdf or other npm package

Good day. Is there any way to extract images from a PDF file using the jsPDF library or another npm package? Are there any solutions that work with React?

I will be grateful for help.

I got all images from pdf file using 'pdf-lib' and 'stream-mime-type' libraries. Here is the function itself, I hope it will be useful.

const {PDFDocument} = require('pdf-lib');
const fs = require('fs')
const {getMimeType} = require('stream-mime-type')
const createJpgFile = require('./utils/createJpgFile.js')
const path = require("path");

/**
 * Reads `test.pdf` (resolved against `rootPath`), collects the raw contents of
 * every indirect object that carries a `contents` property, detects each
 * one's MIME type and writes the JPEG ones out via `createJpgFile`.
 *
 * Output names are `image-<i>`, where `i` is the position of the stream among
 * ALL collected streams (not only the JPEG ones), with extension `jpg`.
 *
 * Relies on names from the enclosing module: `fs`, `path`, `PDFDocument`,
 * `getMimeType`, `createJpgFile` and `rootPath`.
 *
 * @returns {Promise<void>} resolves once every JPEG has been handed to
 *   `createJpgFile` and those calls have resolved
 * @throws rejects if reading/parsing the PDF, MIME detection or writing fails
 */
async function getImageFromPdf() {
  const existingPdfBytes = await fs.promises.readFile(path.join(rootPath, 'test.pdf'))
  const pdfDoc = await PDFDocument.load(existingPdfBytes)
  const pages = pdfDoc.getPages()

  // NOTE(review): the object table is reached through the first page, but it
  // presumably belongs to the whole document rather than to that page - confirm.
  const streams = []
  pages[0].doc.context.indirectObjects.forEach(el => {
    // Borrow hasOwnProperty so objects that shadow or lack it cannot break the scan.
    if (Object.prototype.hasOwnProperty.call(el, 'contents')) streams.push(el.contents)
  })

  // Await getMimeType directly: wrapping it in `new Promise` left the promise
  // pending forever whenever detection returned nothing or threw.
  const detected = await Promise.all(streams.map(el => getMimeType(el)))

  await Promise.all(detected.map(async (el, i) => {
    // Streams with no detection result are skipped instead of stalling the run.
    if (el && el.mime === 'image/jpeg') {
      // `createJpgFile` is the helper this module imports; the previously
      // called `writeJpgFile` is not defined anywhere in view.
      await createJpgFile(streams[i], `image-${i}`, 'jpg')
    }
  }))
}

Addendum: I also wrote a function that extracts JPEG images from a PDF file without using third-party libraries. It locates the start of each JPEG image by its signature bytes. For other formats, you need to use other signatures.

/**
 * Scans the bytes returned by `getArrayBufferFromPdf()` for JPEG start
 * signatures and hands one byte slice per match to `createFile`.
 *
 * A match is the byte sequence FF D8 FF followed by one of E0, E1, E2, E3 or
 * E8. Each slice runs from its signature to the END of the buffer (there is no
 * end-of-image detection). Slices are passed to `createFile` with the name
 * `test_image_<n>` and the extension argument `jpeg`.
 *
 * @returns {Promise<void>} resolves once every `createFile` call has resolved
 */
const parserJpegFromPdf = async () => {
  const pdfBytes = new Uint8Array(await getArrayBufferFromPdf())
  const fourthByteCandidates = [0xe0, 0xe1, 0xe2, 0xe3, 0xe8]

  // Pass 1: record every offset at which a signature begins.
  const signatureOffsets = []
  for (let offset = 0; offset < pdfBytes.length; offset += 1) {
    const hasPrefix =
      pdfBytes[offset] === 0xff &&
      pdfBytes[offset + 1] === 0xd8 &&
      pdfBytes[offset + 2] === 0xff
    // Reads past the end yield undefined, which never matches a candidate.
    if (hasPrefix && fourthByteCandidates.includes(pdfBytes[offset + 3])) {
      signatureOffsets.push(offset)
    }
  }

  // Pass 2: copy out the tail of the buffer starting at each recorded offset.
  const slices = signatureOffsets.map((offset) => pdfBytes.slice(offset))

  // Pass 3: start all writes, then wait for the whole batch.
  const pendingWrites = slices.map(async (bytes, n) => {
    await createFile(bytes, `test_image_${n}`, 'jpeg')
  })
  await Promise.all(pendingWrites)
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM