簡體   English   中英

使用 java 解析 xml 並保留 html 標簽

[英]Parse xml using java and keep html tags

我有一個 xml 我解析並從節點之間獲取數據。 然而,這個數據被 html 標簽包圍。 我創建另一個 xml 並將這些數據放入其中。 現在我必須再次解析它以獲得正確的 html 語法。

請幫忙。

public class XMLfunctions {

public final static Document XMLfromString(String xml){

    Document doc = null;

    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    try {

        DocumentBuilder db = dbf.newDocumentBuilder();

        InputSource is = new InputSource();
        is.setCharacterStream(new StringReader(xml));
        doc = db.parse(is); 

    } catch (ParserConfigurationException e) {
        System.out.println("XML parse error: " + e.getMessage());
        return null;
    } catch (SAXException e) {
        System.out.println("Wrong XML file structure: " + e.getMessage());
        return null;
    } catch (IOException e) {
        System.out.println("I/O exeption: " + e.getMessage());
        return null;
    }

    return doc;

}


/** Returns element value
  * @param elem element (it is XML tag)
  * @return Element value otherwise empty String
  */
 public final static String getElementValue( Node elem ) {
     Node kid;
     if( elem != null){
         if (elem.hasChildNodes()){
             for( kid = elem.getFirstChild(); kid != null; kid = kid.getNextSibling() ){
                 if( kid.getNodeType() == Node.TEXT_NODE  ){
                     return kid.getNodeValue();
                 }
             }
         }
     }
     return "";
 }

/*Start Parsing Body */
 public static String getBodyXML(String id){     
        String line = null;
        try {
            DefaultHttpClient httpClient = new DefaultHttpClient();
            HttpPost httpPost = new HttpPost("http://192.168.1.44:9090/solr/core0/select/?q=content_id:"+id+"&version=2.2&start=0&rows=10&indent=on");
            HttpResponse httpResponse = httpClient.execute(httpPost);
            HttpEntity httpEntity = httpResponse.getEntity();
            line = EntityUtils.toString(httpEntity);

        } catch (UnsupportedEncodingException e) {
            line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
        } catch (MalformedURLException e) {
            line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
        } catch (IOException e) {
            line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
        }
        String st= ParseXMLBodyNode(line,"doc");
        return st;

}

public static String ParseXMLBodyNode(String str,String node){
     String xmlRecords = str;
     String results = "";
     String[] result = new String [1];
     StringBuffer sb = new StringBuffer();
     StringBuffer text = new StringBuffer(); 
     try {
         DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
         DocumentBuilder db = dbf.newDocumentBuilder();
         InputSource is = new InputSource();
         is.setCharacterStream(new StringReader(xmlRecords));
         Document doc = db.parse(is);
         NodeList indiatimes1 = doc.getElementsByTagName(node);
         sb.append("<results count=");
         sb.append("\"1\"");
         sb.append(">\r\n");

         for (int i = 0; i < indiatimes1.getLength(); i++) {
            Node node1 = indiatimes1.item(i);
            if (node1.getNodeType() == Node.ELEMENT_NODE) {
                Element element = (Element) node1;
                NodeList nodelist = element.getElementsByTagName("str");
                Element element1 = (Element) nodelist.item(0);
                NodeList title = element1.getChildNodes();
                title.getLength();
                for(int j=0; j<title.getLength();j++){
                    text.append(title.item(j).getNodeValue());
                }
                System.out.print((title.item(0)).getNodeValue());
                sb.append("<result>");
                    sb.append("<news>");
                    String tmpText = html2text(text.toString());
                        //sb.append("<![CDATA[<body>");
                            sb.append(tmpText);
                        //sb.append("</body>]]>");
                    sb.append("</news>");
                sb.append("</result>\r\n");
                result[i] = title.item(0).getNodeValue();
            }
         }
         sb.append("</results>");
     } catch (Exception e) {
         System.out.println("Exception........"+results );
         e.printStackTrace();
     }
     return sb.toString();
 }

 public static String html2text(String html) {

    String pText = Jsoup.clean(html, Whitelist.basic());
    return pText;
}

我的 class 啟動該過程

public class NewsDetails extends ListActivity{

/** Called when the activity is first created. */
@Override
public void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.listplaceholder);
/*}

@Override
protected void onStart() {*/

    super.onStart();
    Intent myIntent = getIntent(); 
    String id = myIntent.getStringExtra("content_id");
    String title = myIntent.getStringExtra("title");

    ArrayList<HashMap<String, String>> mylist = new ArrayList<HashMap<String, String>>();


    String xml = XMLfunctions.getBodyXML(id);
    Document doc = XMLfunctions.XMLfromString(xml);

    int numResults = XMLfunctions.numResults(doc);

    if((numResults <= 0)){
        Toast.makeText(NewsDetails.this, "No Result Found", Toast.LENGTH_LONG).show();  
        finish();
    }

    NodeList nodes = doc.getElementsByTagName("result");

    for (int i = 0; i < nodes.getLength(); i++) {                           
        HashMap<String, String> map = new HashMap<String, String>();    
        map.put("title", title);
        Element e = (Element)nodes.item(i);
        map.put("news", XMLfunctions.getValue(e, "news"));
        mylist.add(map);            
    }       

    ListAdapter adapter = new SimpleAdapter(this, mylist , R.layout.list_item, new String[] { "title", "news" }, new int[] { R.id.item_title, R.id.item_subtitle });

    setListAdapter(adapter);

    final ListView lv = getListView();
    lv.setTextFilterEnabled(true);
}

從 jsoup 轉換后得到的示例 xml

<results count="1">
<result>
    <news>
        <ul><li><p>as part of its growth plan,</p></li><li><p>in a bid to achieve the target</p></li><li><p>it is pointed out that most of ccl's production came from opencast mines and only 2 mt from underground (ug) mines. ccl is now trying to increase the share underground production. the board of ccl has, thus, approved the introduction of continuous mine in chiru ug at a cost of about rs 145 crore to raise this mine's production from 2 mt to 8 mt per annum.</p></li><li><p>mr ritolia said that.</p></li></ul>
    </news>
</result>
</results>

我想提取新聞標簽之間的內容。 This xml is fed to XMLfromString(String xml) function in XMLFunctions class which then returns only "<" and rest of the body is left.

我無法使用 html 標簽獲取正文以提供格式。

一種選擇是使用 XML CDATA 部分作為:

    <result>
        <news><![CDATA[ 
<ul><li><p>as part of its growth plan,</p></li><li><p>in a bid to achieve the target</p></li><li><p>it is pointed out that most of ccl's production came from opencast mines and only 2 mt from underground (ug) mines. ccl is now trying to increase the share underground production. the board of ccl has, thus, approved the introduction of continuous mine in chiru ug at a cost of about rs 145 crore to raise this mine's production from 2 mt to 8 mt per annum.</p></li><li><p>mr ritolia said that.</p></li></ul>
]]>
        </news>
    </result>
    </results>

那么您的解析器將不會將 HTML 標記視為 XML 並允許您訪問元素的原始內容。 另一種選擇是編碼 HTML 標簽,即將所有<轉換為&lt; , >進入&gt; , &進入&amp; 等有關編碼的更多信息,請參見此處

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM