繁体   English   中英

如何从PHP生成的HTML页面获取正文内容?

[英]How to get the body content from a PHP-generated HTML page?

我正在尝试使用以下代码来获取HTML页面的内容:

String malSearch = "http://myanimelist.net/anime.php?letter=" + firstLetter;
URL url = new URL(malSearch);
URLConnection con = url.openConnection();
InputStream in = con.getInputStream();
String encoding = con.getContentEncoding();
encoding = encoding == null ? "UTF-8" : encoding;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buf = new byte[8192];
int len = 0;
while ((len = in.read(buf)) != -1) {
    baos.write(buf, 0, len);
}
String body = new String(baos.toByteArray(), encoding);

它可以正常工作,但不能满足我的实际需求。 它给了我这个:

<html>
 <head>
  <META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW">
  <meta name="format-detection" content="telephone=no">
  <meta name="viewport" content="initial-scale=1.0">
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
 </head>
 <body style="margin:0px">
  <iframe src="/_Incapsula_Resource?CWUDNSAI=9&xinfo=6-122029399-0 0NNN RT(1404149034204 2) q(0 -1 -1 -1) r(0 -1) B12(4,315,0) U1&incident_id=124001330081285077-564449081699338326&edet=12&cinfo=4ee46646c753833e04000000" frameborder=0 width="100%" height="100%" marginheight="0px" marginwidth="0px">Request unsuccessful. Incapsula incident ID: 124001330081285077-564449081699338326</iframe>
 </body>
</html>

应该给我整页的时间(大约800行)。

我认为这是由于这是一个使用PHP的网站,但我不确定。 有人可以告诉我如何获取整个HTML内容吗?

这是我要从中获取内容的页面: http : //myanimelist.net/anime.php?letter=A

该站点使用名为Incapsula的服务。 网站管理员将Incapsula配置为防止漫游器访问其内容。

我建议您联系管理员并要求将其列入白名单,尝试绕过系统可能会使您被禁止进入黑名单。

这是因为您从服务器获取的HTML包含应加载的其他页面的地址。 您的客户应该获取其他页面,如果您需要该页面,则可以检索其他页面,就像在网络浏览器中显示的一样。

站点使用什么都没有关系。

编辑:仔细阅读错误消息,您会发现此错误:

Request unsuccessful. Incapsula incident ID: 124001330081285077-

因此,您的通话出现问题,或者服务器不喜欢您的代理(正在使用IE或Chrome这样的浏览器)

是的,您的通话有问题。 我能够调用该页面并获得没有错误的响应:

<html>
<head>
<META NAME="robots" CONTENT="noindex,nofollow">
<script>
(function(){function getSessionCookies(){cookieArray=new Array();var cName=/^\s?incap_ses_/;var c=document.cookie.split(";");for(var i=0;i<c.length;i++){key=c[i].substr(0,c[i].indexOf("="));value=c[i].substr(c[i].indexOf("=")+1,c[i].length);if(cName.test(key)){cookieArray[cookieArray.length]=value}}return cookieArray}function setIncapCookie(vArray){try{cookies=getSessionCookies();digests=new Array(cookies.length);for(var i=0;i<cookies.length;i++){digests[i]=simpleDigest((vArray)+cookies[i])}res=vArray+",digest="+(digests.join())}catch(e){res=vArray+",digest="+(encodeURIComponent(e.toString()))}createCookie("___utmvc",res,20)}function simpleDigest(mystr){var res=0;for(var i=0;i<mystr.length;i++){res+=mystr.charCodeAt(i)}return res}function createCookie(name,value,seconds){if(seconds){var date=new Date();date.setTime(date.getTime()+(seconds*1000));var expires="; expires="+date.toGMTString()}else{var expires=""}document.cookie=name+"="+value+expires+"; path=/"}function test(o){var res="";var vArray=new Array();for(test in o){switch(o[test]){case"exists":try{vArray[vArray.length]=encodeURIComponent(test+"="+typeof(eval(test)))}catch(e){vArray[vArray.length]=encodeURIComponent(test+"="+e)}break;case"value":try{vArray[vArray.length]=encodeURIComponent(test+"="+eval(test).toString())}catch(e){vArray[vArray.length]=encodeURIComponent(test+"="+e)}break;case"plugins":try{p=navigator.plugins;pres="";for(a in p){pres+=(p[a]["description"]+" ").substring(0,20)}vArray[vArray.length]=encodeURIComponent("plugins="+pres)}catch(e){vArray[vArray.length]=encodeURIComponent("plugins="+e)}break;case"plugin":try{a=navigator.plugins;for(i in a){f=a[i]["filename"].split(".");if(f.length==2){vArray[vArray.length]=encodeURIComponent("plugin="+f[1]);break}}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin="+e)}break}}vArray=vArray.join();return vArray}var o={navigator:"exists","navigator.vendor":"value",opera:"exists",ActiveXObject:"exists","navigator.appName":"value",platform:"plugin",webkitURL:"exists","navigator.plugins.length==0":"value"};try{setIncapCookie(test(o));document.createElement("img").src="/_Incapsula_Resource?SWKMTFSR=1&e="+Math.random()}catch(e){img=document.createElement("img");img.src="/_Incapsula_Resource?SWKMTFSR=1&e="+e}})();
</script>
<script>
(function() { 
var z="";var b="7472797B766172207868723B76617220743D6E6577204461746528292E67657454696D6528293B766172207374617475733D227374617274223B7661722074696D696E673D6E65772041727261792833293B77696E646F772E6F6E756E6C6F61643D66756E6374696F6E28297B74696D696E675B325D3D22723A222B286E6577204461746528292E67657454696D6528292D74293B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B69662877696E646F772E584D4C4874747052657175657374297B7868723D6E657720584D4C48747470526571756573747D656C73657B7868723D6E657720416374697665584F626A65637428224D6963726F736F66742E584D4C4854545022297D7868722E6F6E726561647973746174656368616E67653D66756E6374696F6E28297B737769746368287868722E72656164795374617465297B6361736520303A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374206E6F7420696E697469616C697A656420223B627265616B3B6361736520313A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2073657276657220636F6E6E656374696F6E2065737461626C6973686564223B627265616B3B6361736520323A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374207265636569766564223B627265616B3B6361736520333A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2070726F63657373696E672072657175657374223B627265616B3B6361736520343A7374617475733D22636F6D706C657465223B74696D696E675B315D3D22633A222B286E6577204461746528292E67657454696D6528292D74293B6966287868722E7374617475733D3D323030297B706172656E742E6C6F636174696F6E2E72656C6F616428297D627265616B7D7D3B74696D696E675B305D3D22733A222B286E6577204461746528292E67657454696D6528292D74293B7868722E6F70656E2822474554222C222F5F496E63617073756C615F5265736F757263653F535748414E45444C3D353938343034363637363030353035323533302C343230323939303534303139393036353232332C353135373831353236363332383535313738342C323633333631222C66616C7365293B7868722E73656E64286E756C6C297D63617463682863297B7374617475732B3D6E6577204461746528292E67657454696D6528292D742B2220696E6361705F6578633A20222B633B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B";for (var i=0;i<b.length;i+=2){z=z+parseInt(b.substring(i, i+2), 16)+",";}z = z.substring(0,z.length-1); eval(eval('String.fromCharCode('+z+')'));})();
</script></head>
<body>
<iframe style="display:none;visibility:hidden;" src="http://content.incapsula.com/jsTest.html" id="gaIframe"></iframe>
</body></html>

该页面上的主要内容已加载到<iframe>标记中。 在该标签中,您可以看到内容网址。

String malSearch = "http://myanimelist.net//_Incapsula_Resource?CWUDNSAI=9&xinfo=6-122029399-0 0NNN RT(1404149034204 2) q(0 -1 -1 -1) r(0 -1) B12(4,315,0) U1&incident_id=124001330081285077-564449081699338326&edet=12&cinfo=4ee46646c753833e04000000";
URL url = new URL(malSearch);
URLConnection con = url.openConnection();
InputStream in = con.getInputStream();

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM