简体   繁体   中英

Convert PDF to HTML (XML like sturcture ) using pdfjs

I found this code which show PDF in HTML (not image). Only Once i can get all pages and i don't know what i did back then and nothing after that work . Can't Convert last page of pdf to xml like html. Focus it just get structure of data where data can retrieve.

I can't get last page result here in log. So if there is 1 page PDF don't convert . I like to convert data from PDF to HTML(XML) like structure.

 var __PDF_DOC, __CURRENT_PAGE, __TOTAL_PAGES, __PAGE_RENDERING_IN_PROGRESS = 0, __CANVAS = $('#pdf-canvas').get(0), _x ="", __o=0, __CANVAS_CTX = __CANVAS.getContext('2d'); function showPDF(pdf_url) { $("#pdf-loader").show(); PDFJS.getDocument({ url: pdf_url }).then(function(pdf_doc) { __PDF_DOC = pdf_doc; __TOTAL_PAGES = __PDF_DOC.numPages; // Hide the pdf loader and show pdf container in HTML $("#pdf-loader").hide(); $("#pdf-contents").show(); $("#pdf-total-pages").text(__TOTAL_PAGES); // Show the first page showPage(1); }).catch(function(error) { // If error re-show the upload button $("#pdf-loader").hide(); $("#upload-button").show(); alert(error.message); });; } function showPage(page_no) { __PAGE_RENDERING_IN_PROGRESS = 1; __CURRENT_PAGE = page_no; // Disable Prev & Next buttons while page is being loaded $("#pdf-next, #pdf-prev").attr('disabled', 'disabled'); // While page is being rendered hide the canvas and show a loading message $("#pdf-canvas").hide(); $("#page-loader").show(); // Update current page in HTML $("#pdf-current-page").text(page_no); // Fetch the page __PDF_DOC.getPage(page_no).then(function(page) { // As the canvas is of a fixed width we need to set the scale of the viewport accordingly var scale_required = __CANVAS.width / page.getViewport(1).width; // Get viewport of the page at required scale var viewport = page.getViewport(scale_required); // Set canvas height __CANVAS.height = viewport.height; var renderContext = { canvasContext: __CANVAS_CTX, viewport: viewport }; // Render the page contents in the canvas page.render(renderContext).then(function() { __PAGE_RENDERING_IN_PROGRESS = 0; // Re-enable Prev & Next buttons $("#pdf-next, #pdf-prev").removeAttr('disabled'); // Show the canvas and hide the page loader $("#pdf-canvas").show(); $("#page-loader").hide(); // Return the text contents of the page after the pdf has been rendered in the canvas return page.getTextContent(); }).then(function(textContent) { // Get canvas offset var canvas_offset = $("#pdf-canvas").offset(); // Clear HTML for text layer sor(); $("#text-layer").html(''); // Assign the CSS created to the text-layer element $("#text-layer").css({ left: canvas_offset.left + 'px', top: canvas_offset.top + 'px', height: __CANVAS.height + 'px', width: __CANVAS.width + 'px' }); // Pass the data to the method for rendering of text over the pdf canvas. PDFJS.renderTextLayer({ textContent: textContent, container: $("#text-layer").get(0), viewport: viewport, textDivs: [] }); }); }); } // Upon click this should should trigger click on the #file-to-upload file input element // This is better than showing the not-good-looking file input element $("#upload-button").on('click', function() { $("#pdf-main-container").show(); var _x =""; $("#file-to-upload").trigger('click'); }); // When user chooses a PDF file $("#file-to-upload").on('change', function() { // Validate whether PDF if(['application/pdf'].indexOf($("#file-to-upload").get(0).files[0].type) == -1) { alert('Error : Not a PDF'); return; } $("#upload-button").hide(); //_x =" "; //__CURRENT_PAGE = undefined; // Send the object url of the pdf showPDF(URL.createObjectURL($("#file-to-upload").get(0).files[0])); }); // Previous page of the PDF $("#pdf-prev").on('click', function() { if(__CURRENT_PAGE != 1) showPage(--__CURRENT_PAGE); }); // Next page of the PDF $("#pdf-next").on('click', function() { if(__CURRENT_PAGE != __TOTAL_PAGES) showPage(++__CURRENT_PAGE); }); function sor() { ++__o; if(__o==__CURRENT_PAGE){ _x = _x + $("#text-layer").html(); } if((__CURRENT_PAGE != __TOTAL_PAGES) && __o == __CURRENT_PAGE){ showPage(++__CURRENT_PAGE); } else { _x = _x + $("#text-layer").html(); var _y= _x.replace(/(style="([^>]+)")/gi,""); $("#text-layer,#pdf-canvas").html(" "); $("#upload-button").show(); $("#pdf-main-container").show(); console.log(_y); } }
 #upload-button { width: 150px; display: block; margin: 20px auto; } #file-to-upload { display: none; } #pdf-main-container { width: 400px; margin: 20px auto; } #pdf-loader { display: none; text-align: center; color: #999999; font-size: 13px; line-height: 100px; height: 100px; } #pdf-contents { display: none; } #pdf-meta { overflow: hidden; margin: 0 0 20px 0; z-index: 2; position: relative; } #pdf-buttons { float: left; } #page-count-container { float: right; } #pdf-current-page { display: inline; } #pdf-total-pages { display: inline; } #pdf-canvas { border: 1px solid rgba(0,0,0,0.2); box-sizing: border-box; } #page-loader { height: 100px; line-height: 100px; text-align: center; display: none; color: #999999; font-size: 13px; } #text-layer { position: absolute; left: 0; top: 0; right: 0; bottom: 0; overflow: hidden; opacity: 0.2; line-height: 1.0; } #text-layer > div { color: transparent; position: absolute; white-space: pre; cursor: text; transform-origin: 0% 0%; }
 <!DOCTYPE html> <html> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> <meta name="viewport" content="width=device-width, initial-scale = 1.0, maximum-scale = 1.0, user-scalable=no"> <script src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script> <script src="https://intaxing.in/js/pdf.js"></script> <script src="https://intaxing.in/js/pdf.worker.js"></script> </head> <body> <button id="upload-button">Select PDF</button> <input type="file" id="file-to-upload" accept="application/pdf" /> <div id="pdf-main-container"> <div id="pdf-loader">Loading document ...</div> <div id="pdf-contents"> <div id="pdf-meta"> <div id="pdf-buttons"> <button id="pdf-prev">Previous</button> <button id="pdf-next">Next</button> </div> <div id="page-count-container">Page <div id="pdf-current-page"></div> of <div id="pdf-total-pages"></div></div> </div> <canvas id="pdf-canvas" width="400"></canvas> <div id="text-layer"></div> <div id="page-loader">Loading page ...</div> </div> </div> </body> </html>

I found this code which show PDF in HTML (not image). Only Once i can get all pages and i don't know what i did back then and nothing after that work . Can't Convert last page of pdf to xml like html. Focus it just get structure of data where data can retrieve.

I can't get last page result here in log. So if there is 1 page PDF don't convert . I like to convert data from PDF to HTML(XML) like structure.

 var __PDF_DOC, __CURRENT_PAGE, __TOTAL_PAGES, __PAGE_RENDERING_IN_PROGRESS = 0, __CANVAS = $('#pdf-canvas').get(0), _x ="", __o=0, __CANVAS_CTX = __CANVAS.getContext('2d'); function showPDF(pdf_url) { $("#pdf-loader").show(); PDFJS.getDocument({ url: pdf_url }).then(function(pdf_doc) { __PDF_DOC = pdf_doc; __TOTAL_PAGES = __PDF_DOC.numPages; // Hide the pdf loader and show pdf container in HTML $("#pdf-loader").hide(); $("#pdf-contents").show(); $("#pdf-total-pages").text(__TOTAL_PAGES); // Show the first page showPage(1); }).catch(function(error) { // If error re-show the upload button $("#pdf-loader").hide(); $("#upload-button").show(); alert(error.message); });; } function showPage(page_no) { __PAGE_RENDERING_IN_PROGRESS = 1; __CURRENT_PAGE = page_no; // Disable Prev & Next buttons while page is being loaded $("#pdf-next, #pdf-prev").attr('disabled', 'disabled'); // While page is being rendered hide the canvas and show a loading message $("#pdf-canvas").hide(); $("#page-loader").show(); // Update current page in HTML $("#pdf-current-page").text(page_no); // Fetch the page __PDF_DOC.getPage(page_no).then(function(page) { // As the canvas is of a fixed width we need to set the scale of the viewport accordingly var scale_required = __CANVAS.width / page.getViewport(1).width; // Get viewport of the page at required scale var viewport = page.getViewport(scale_required); // Set canvas height __CANVAS.height = viewport.height; var renderContext = { canvasContext: __CANVAS_CTX, viewport: viewport }; // Render the page contents in the canvas page.render(renderContext).then(function() { __PAGE_RENDERING_IN_PROGRESS = 0; // Re-enable Prev & Next buttons $("#pdf-next, #pdf-prev").removeAttr('disabled'); // Show the canvas and hide the page loader $("#pdf-canvas").show(); $("#page-loader").hide(); // Return the text contents of the page after the pdf has been rendered in the canvas return page.getTextContent(); }).then(function(textContent) { // Get canvas offset var canvas_offset = $("#pdf-canvas").offset(); // Clear HTML for text layer sor(); $("#text-layer").html(''); // Assign the CSS created to the text-layer element $("#text-layer").css({ left: canvas_offset.left + 'px', top: canvas_offset.top + 'px', height: __CANVAS.height + 'px', width: __CANVAS.width + 'px' }); // Pass the data to the method for rendering of text over the pdf canvas. PDFJS.renderTextLayer({ textContent: textContent, container: $("#text-layer").get(0), viewport: viewport, textDivs: [] }); }); }); } // Upon click this should should trigger click on the #file-to-upload file input element // This is better than showing the not-good-looking file input element $("#upload-button").on('click', function() { $("#pdf-main-container").show(); var _x =""; $("#file-to-upload").trigger('click'); }); // When user chooses a PDF file $("#file-to-upload").on('change', function() { // Validate whether PDF if(['application/pdf'].indexOf($("#file-to-upload").get(0).files[0].type) == -1) { alert('Error : Not a PDF'); return; } $("#upload-button").hide(); //_x =" "; //__CURRENT_PAGE = undefined; // Send the object url of the pdf showPDF(URL.createObjectURL($("#file-to-upload").get(0).files[0])); }); // Previous page of the PDF $("#pdf-prev").on('click', function() { if(__CURRENT_PAGE != 1) showPage(--__CURRENT_PAGE); }); // Next page of the PDF $("#pdf-next").on('click', function() { if(__CURRENT_PAGE != __TOTAL_PAGES) showPage(++__CURRENT_PAGE); }); function sor() { ++__o; if(__o==__CURRENT_PAGE){ _x = _x + $("#text-layer").html(); } if((__CURRENT_PAGE != __TOTAL_PAGES) && __o == __CURRENT_PAGE){ showPage(++__CURRENT_PAGE); } else { _x = _x + $("#text-layer").html(); var _y= _x.replace(/(style="([^>]+)")/gi,""); $("#text-layer,#pdf-canvas").html(" "); $("#upload-button").show(); $("#pdf-main-container").show(); console.log(_y); } }
 #upload-button { width: 150px; display: block; margin: 20px auto; } #file-to-upload { display: none; } #pdf-main-container { width: 400px; margin: 20px auto; } #pdf-loader { display: none; text-align: center; color: #999999; font-size: 13px; line-height: 100px; height: 100px; } #pdf-contents { display: none; } #pdf-meta { overflow: hidden; margin: 0 0 20px 0; z-index: 2; position: relative; } #pdf-buttons { float: left; } #page-count-container { float: right; } #pdf-current-page { display: inline; } #pdf-total-pages { display: inline; } #pdf-canvas { border: 1px solid rgba(0,0,0,0.2); box-sizing: border-box; } #page-loader { height: 100px; line-height: 100px; text-align: center; display: none; color: #999999; font-size: 13px; } #text-layer { position: absolute; left: 0; top: 0; right: 0; bottom: 0; overflow: hidden; opacity: 0.2; line-height: 1.0; } #text-layer > div { color: transparent; position: absolute; white-space: pre; cursor: text; transform-origin: 0% 0%; }
 <!DOCTYPE html> <html> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> <meta name="viewport" content="width=device-width, initial-scale = 1.0, maximum-scale = 1.0, user-scalable=no"> <script src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script> <script src="https://intaxing.in/js/pdf.js"></script> <script src="https://intaxing.in/js/pdf.worker.js"></script> </head> <body> <button id="upload-button">Select PDF</button> <input type="file" id="file-to-upload" accept="application/pdf" /> <div id="pdf-main-container"> <div id="pdf-loader">Loading document ...</div> <div id="pdf-contents"> <div id="pdf-meta"> <div id="pdf-buttons"> <button id="pdf-prev">Previous</button> <button id="pdf-next">Next</button> </div> <div id="page-count-container">Page <div id="pdf-current-page"></div> of <div id="pdf-total-pages"></div></div> </div> <canvas id="pdf-canvas" width="400"></canvas> <div id="text-layer"></div> <div id="page-loader">Loading page ...</div> </div> </div> </body> </html>

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM