简体   繁体   中英

Get pdf buffer with puppeteer with session generated url

I've been using puppeteer to try to get PDFs - or their response buffers - from a website which makes two requests after the link for a document is clicked (the document opens in a new tab):

  1. The first request ( http://epicdocs.planningni.gov.uk/ViewDocument.pa?uri=4157826&ext=PDF ) retrieves the session guid to access the document
  2. The second request ( http://epicdocs.planningni.gov.uk/ViewDocument.aspx?guid=4ecd1fe5-43c6-4202-96e3-66b393fb819c ) uses that guid to access the document and render the pdf on the browser.

The result of my attempts has been a blank PDF, even though it was generated after the page had loaded (checked with Fiddler).

I've tried

  • Intercepting targetcreated event to get the page
  • Get the second request url and use page.goto to get the pdf
  • Wait on the page response to get the buffer
  • Set Page.setDownloadBehaviour to allow download instead of rendering it in the browser

Any guidance and help is appreciated. The code tried is below:

// Puppeteer is used by run() to launch the browser.
const puppeteer = require("puppeteer");

// Shared browser handle: assigned in run() and read by getDocument().
let browser;

/**
 * Clicks the "view document" link of one row in the document list, waits for
 * the tab it opens, and prints that tab to a PDF file.
 *
 * Only row 19 is processed; any other index returns "" without touching the
 * page (this filter is kept from the original code).
 *
 * Reads the module-level `browser` handle to listen for the new tab.
 *
 * @param {number} index - Row number used to build the link selector
 *   `#repDocuments__ctl<index>_lnkViewDoc`.
 * @param {string} title - Used verbatim as the output file name (`<title>.pdf`).
 * @param {object} page - Puppeteer page that shows the document list.
 * @returns {Promise<Buffer|string>} Whatever `pdfPage.pdf()` resolves to, or
 *   "" when the row is skipped.
 */
async function getDocument(index, title, page) {
  if (index !== 19) return "";
  console.log("getDocument START");
  console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);

  // Keep listening until the ViewDocument tab shows up. A `once` listener is
  // used up by the first created target even when its URL does not match,
  // which would leave this promise pending forever.
  const docPagePromise = new Promise(resolve => {
    const onTargetCreated = async target => {
      const targetUrl = await target.url();
      if (targetUrl.includes("ViewDocument.aspx?")) {
        console.log(targetUrl);
        browser.off("targetcreated", onTargetCreated);
        resolve(target.page());
      } else {
        console.log("Failed to detect the ViewDocument page");
      }
    };
    browser.on("targetcreated", onTargetCreated);
  });

  // An earlier attempt sent "Page.setDownloadBehaviour" through page._client
  // to force a download; it did not work. The DevTools Protocol spells the
  // command `Page.setDownloadBehavior` with a `behavior` parameter.
  await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
  const pdfPage = await docPagePromise;

  // The page event is named "console"; "console.log" is never emitted.
  pdfPage.on("console", msg => console.log(msg));

  // Diagnostic only: the handler's result is not used by this function.
  // NOTE(review): the original author reports this never fires - presumably
  // the tab's requests finish before the listener is attached; confirm.
  pdfPage.on("response", async response => {
    console.log("PDF PAGE Response");
    try {
      const responseBuffer = await response.buffer();
      console.log(
        "PDF PAGE Response Header: " + JSON.stringify(response.headers())
      );
      console.log("PDF PAGE Response Buffer: " + responseBuffer);
    } catch (err) {
      // A body that cannot be read must not surface as an unhandled rejection.
      console.log("PDF PAGE Response could not be read: " + err.message);
    }
  });

  // The original author observed that the target reports the document URL
  // while the page URL is blank, so both are logged for comparison.
  const pdfTitle = await pdfPage.title();
  console.log("PDFPage URL: " + pdfPage.url());
  console.log("PDFPage Title: " + pdfTitle);

  const pdfTarget = pdfPage.target();
  console.log("PDFTarget URL: " + pdfTarget.url());
  console.log("PDFTarget Type: " + pdfTarget.type());

  // Fixed pause before printing, kept from the original.
  await pdfPage.waitFor(3000);
  // NOTE(review): `title` comes from the scraped page and is used as a path
  // without sanitising - confirm it cannot contain path separators.
  const pdf = await pdfPage.pdf({ path: title + ".pdf" });
  console.log(pdf);
  return pdf;
}

/**
 * Opens the first document group on the case-file page, reads one record per
 * table row, and attaches the result of getDocument() to each record.
 *
 * @param {object} page - Puppeteer page showing the group summary.
 * @returns {Promise<Array<object>>} One object per row with `type`,
 *   `datePublished`, `dateReceived`, `docType`, `description` and `docBuffer`.
 */
async function getAdditionalDocumentation(page) {
  console.log("getAdditionalDocumentation START");

  const groupLink = "#repGroupSummary__ctl1_lnkGroupName";
  const rowSelector = "#pnlDocumentList > table > tbody > tr";

  await page.waitForSelector(groupLink);
  await page.click(groupLink);
  await page.waitForSelector(rowSelector);

  // Fixed pause after the rows appear, before they are read.
  await page.waitFor(2000);

  const documents = await page.$$eval(rowSelector, rows => {
    // This callback is handed to $$eval, so the helper is defined inside it
    // rather than captured from the enclosing scope.
    const textOf = (row, selector) => row.querySelector(selector).innerText;
    return rows.map(row => ({
      type: textOf(row, ".tdl-subgroup > span"),
      datePublished: textOf(row, ".tdl-date > span[id*='DatePublished']"),
      dateReceived: textOf(row, ".tdl-date > span[id*='DateReceived']"),
      docType: textOf(row, ".tdl-doctype > span"),
      description: textOf(row, ".tdl-description > span")
    }));
  });

  // Documents are fetched one after another; the row number passed to
  // getDocument() starts at 1.
  let rowNumber = 1;
  for (const entry of documents) {
    entry.docBuffer = await getDocument(rowNumber, entry.description, page);
    rowNumber += 1;
  }

  await page.click("#btnSummary");
  console.log("getAdditionalDocumentation FINISH");

  return documents;
}

/**
 * Opens the external-documents link of a planning application, waits for the
 * case-file tab it opens, and collects the documents listed there.
 *
 * @param {object} page - Puppeteer page showing the application details.
 * @param {object} browser - Puppeteer browser that emits "targetcreated".
 * @returns {Promise<{additionalDocumentation: Array<object>}>} The records
 *   returned by getAdditionalDocumentation() for the case-file tab.
 */
async function getDocuments(page, browser) {
  console.log("getDocuments");

  // Keep listening until the ShowCaseFile tab shows up. A `once` listener is
  // used up by the first created target even when its URL does not match,
  // which would leave this promise pending forever.
  const newPagePromise = new Promise(resolve => {
    const onTargetCreated = async target => {
      const targetUrl = await target.url();
      if (targetUrl.includes("ShowCaseFile.aspx?")) {
        console.log(targetUrl);
        browser.off("targetcreated", onTargetCreated);
        resolve(target.page());
      } else {
        console.log("Failed to detect the ShowCaseFile page");
      }
    };
    browser.on("targetcreated", onTargetCreated);
  });

  await page.click("#tab_externalDocuments > span");
  await page.waitForSelector("#hp-doc-link");

  await page.click("#hp-doc-link");
  const newPage = await newPagePromise;

  const additionalDocumentation = await getAdditionalDocumentation(newPage);

  return {
    additionalDocumentation
  };
}




/**
 * Entry point: launches the browser, searches the public-access site for one
 * planning reference, and logs the documents found for it.
 *
 * Assigns the module-level `browser` handle and closes the browser when done,
 * whether the scrape succeeded or failed.
 *
 * @returns {Promise<void>} Rejects with the first error raised while
 *   launching or scraping.
 */
async function run() {
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();

    page.on("console", msg => {
      // `args` is a method in Puppeteer 1.0+ and a plain property in the
      // earlier releases this script was written for; accept both.
      const args = typeof msg.args === "function" ? msg.args() : msg.args;
      console.log("PAGE LOG:", ...args);
    });

    const planningReference = "LA04/2017/1388/F";
    await page.goto(
      "http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
    );
    await page.waitForSelector("#simpleSearchString");
    await page.type("#simpleSearchString", planningReference);
    await page.click("#simpleSearchForm > div.row3 > input.button.primary");

    await page.waitForSelector("#simpleDetailsTable");

    console.log("getDocuments START");
    const documents = await getDocuments(page, browser);
    console.log("getDocuments FINISH");

    console.log(documents);
    console.log(documents.additionalDocumentation.length);
  } finally {
    // `browser` is still unset when launch() itself failed; calling close()
    // then would throw a TypeError that hides the real error.
    if (browser) {
      await browser.close();
    }
  }
}

// Report a failed run and exit non-zero instead of leaving the promise
// returned by run() without a rejection handler.
run().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

Use page.exposeFunction to expose a Node-side function that writes the buffer data to disk:

// Expose a Node-side helper to the page as `window.writeABString`.
// The page passes the file content as a binary string (one character per
// byte, char codes 0-255) together with the path to write it to.
// Resolves with `targetFile` once written; rejects with the fs error otherwise.
page.exposeFunction("writeABString", async (strbuf, targetFile) => {
    // Required here so the snippet does not depend on a file-level `fs`.
    const fs = require("fs");

    console.log("In 'writeABString' function...");

    // "latin1" stores the low 8 bits of each char code as one byte, which is
    // the same mapping as copying charCodeAt(i) into a Uint8Array.
    const buf = Buffer.from(strbuf, "latin1");

    return new Promise((resolve, reject) => {
        // Try saving the file.
        fs.writeFile(targetFile, buf, err => {
            if (err) reject(err);
            else resolve(targetFile);
        });
    });
});

Then, inside the page, fetch the download link with the Fetch API, read the response as an ArrayBuffer, convert it to a binary string, and pass it to the exposed function:

// Runs inside the page: downloads `geturl` with the page's own cookies and
// hands the bytes to the exposed `window.writeABString` helper.
// Resolves to whatever writeABString returns; on any failure the error is
// logged to the page console and the result is undefined.
page.evaluate(async () => {

    // Convert an ArrayBuffer to a binary string (one character per byte).
    function arrayBufferToString(buffer) {
        var bytes = new Uint8Array(buffer);
        // Convert in chunks so a large file does not exceed the engine's
        // argument limit for Function.prototype.apply.
        var chunkSize = 0x8000;
        var result = '';

        for (var i = 0; i < bytes.length; i += chunkSize) {
            // subarray() clamps the end index, so the last chunk needs no special case.
            result += String.fromCharCode.apply(null, bytes.subarray(i, i + chunkSize));
        }
        return result;
    }

    let geturl = "https://whateverurl.example.com";

    try {
        const response = await fetch(geturl, {
            credentials: 'same-origin', // send cookies, useful when logged into the website
        });
        // Without this check an HTTP error page would be saved as the PDF.
        if (!response.ok) {
            throw new Error('HTTP ' + response.status + ' for ' + geturl);
        }
        // `responseType` is not a fetch option; the body format is chosen here.
        const arrayBuffer = await response.arrayBuffer();
        var bufstring = arrayBufferToString(arrayBuffer);
        return await window.writeABString(bufstring, '/tmp/downloadtest.pdf');
    } catch (error) {
        console.log('Request failed: ', error);
    }
});

For more info look at this issue on the github puppeteer page. The above solution was also suggested in the issue. Source

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM